From be5a0c126ad1dea2128dc5aef12c87083518d1ab Mon Sep 17 00:00:00 2001
From: venkatesh.pallipadi@intel.com
Date: Wed, 10 Feb 2010 11:57:06 -0800
Subject: x86, pat: Preparatory changes in pat.c for bigger rbtree change

Minor changes in pat.c to cleanup code and make it smoother to introduce
bigger rbtree only change in the following patch. The changes are cleaup
only and should not have any functional impact.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <20100210195909.792781000@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/mm/pat.c          | 170 +++++++++++++++++++++++----------------------
 arch/x86/mm/pat_internal.h |  28 ++++++++
 2 files changed, 116 insertions(+), 82 deletions(-)
 create mode 100644 arch/x86/mm/pat_internal.h

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index ae9648eb1c7f..628e507b7936 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -30,6 +30,8 @@
 #include <asm/pat.h>
 #include <asm/io.h>
 
+#include "pat_internal.h"
+
 #ifdef CONFIG_X86_PAT
 int __read_mostly pat_enabled = 1;
 
@@ -53,19 +55,15 @@ static inline void pat_disable(const char *reason)
 #endif
 
 
-static int debug_enable;
+int pat_debug_enable;
 
 static int __init pat_debug_setup(char *str)
 {
-	debug_enable = 1;
+	pat_debug_enable = 1;
 	return 0;
 }
 __setup("debugpat", pat_debug_setup);
 
-#define dprintk(fmt, arg...) \
-	do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
-
-
 static u64 __read_mostly boot_pat_state;
 
 enum {
@@ -132,17 +130,6 @@ void pat_init(void)
 
 #undef PAT
 
-static char *cattr_name(unsigned long flags)
-{
-	switch (flags & _PAGE_CACHE_MASK) {
-	case _PAGE_CACHE_UC:		return "uncached";
-	case _PAGE_CACHE_UC_MINUS:	return "uncached-minus";
-	case _PAGE_CACHE_WB:		return "write-back";
-	case _PAGE_CACHE_WC:		return "write-combining";
-	default:			return "broken";
-	}
-}
-
 /*
  * The global memtype list keeps track of memory type for specific
  * physical memory areas. Conflicting memory types in different
@@ -159,14 +146,6 @@ static char *cattr_name(unsigned long flags)
  * memtype_lock protects both the linear list and rbtree.
  */
 
-struct memtype {
-	u64			start;
-	u64			end;
-	unsigned long		type;
-	struct list_head	nd;
-	struct rb_node		rb;
-};
-
 static struct rb_root memtype_rbroot = RB_ROOT;
 static LIST_HEAD(memtype_list);
 static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype list */
@@ -349,6 +328,64 @@ static int free_ram_pages_type(u64 start, u64 end)
 	return 0;
 }
 
+static int memtype_check_insert(struct memtype *new, unsigned long *new_type)
+{
+	struct memtype *entry;
+	u64 start, end;
+	unsigned long actual_type;
+	struct list_head *where;
+	int err = 0;
+
+	start = new->start;
+	end = new->end;
+	actual_type = new->type;
+
+	/* Search for existing mapping that overlaps the current range */
+	where = NULL;
+	list_for_each_entry(entry, &memtype_list, nd) {
+		if (end <= entry->start) {
+			where = entry->nd.prev;
+			break;
+		} else if (start <= entry->start) { /* end > entry->start */
+			err = chk_conflict(new, entry, new_type);
+			if (!err) {
+				dprintk("Overlap at 0x%Lx-0x%Lx\n",
+					entry->start, entry->end);
+				where = entry->nd.prev;
+			}
+			break;
+		} else if (start < entry->end) { /* start > entry->start */
+			err = chk_conflict(new, entry, new_type);
+			if (!err) {
+				dprintk("Overlap at 0x%Lx-0x%Lx\n",
+					entry->start, entry->end);
+
+				/*
+				 * Move to right position in the linked
+				 * list to add this new entry
+				 */
+				list_for_each_entry_continue(entry,
+							&memtype_list, nd) {
+					if (start <= entry->start) {
+						where = entry->nd.prev;
+						break;
+					}
+				}
+			}
+			break;
+		}
+	}
+	if (!err) {
+		if (where)
+			list_add(&new->nd, where);
+		else
+			list_add_tail(&new->nd, &memtype_list);
+
+		memtype_rb_insert(&memtype_rbroot, new);
+	}
+	return err;
+}
+
 /*
  * req_type typically has one of the:
  * - _PAGE_CACHE_WB
@@ -364,9 +401,8 @@ static int free_ram_pages_type(u64 start, u64 end)
 int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 		    unsigned long *new_type)
 {
-	struct memtype *new, *entry;
+	struct memtype *new;
 	unsigned long actual_type;
-	struct list_head *where;
 	int is_range_ram;
 	int err = 0;
 
@@ -423,42 +459,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 	spin_lock(&memtype_lock);
 
-	/* Search for existing mapping that overlaps the current range */
-	where = NULL;
-	list_for_each_entry(entry, &memtype_list, nd) {
-		if (end <= entry->start) {
-			where = entry->nd.prev;
-			break;
-		} else if (start <= entry->start) { /* end > entry->start */
-			err = chk_conflict(new, entry, new_type);
-			if (!err) {
-				dprintk("Overlap at 0x%Lx-0x%Lx\n",
-					entry->start, entry->end);
-				where = entry->nd.prev;
-			}
-			break;
-		} else if (start < entry->end) { /* start > entry->start */
-			err = chk_conflict(new, entry, new_type);
-			if (!err) {
-				dprintk("Overlap at 0x%Lx-0x%Lx\n",
-					entry->start, entry->end);
-
-				/*
-				 * Move to right position in the linked
-				 * list to add this new entry
-				 */
-				list_for_each_entry_continue(entry,
-							&memtype_list, nd) {
-					if (start <= entry->start) {
-						where = entry->nd.prev;
-						break;
-					}
-				}
-			}
-			break;
-		}
-	}
-
+	err = memtype_check_insert(new, new_type);
 	if (err) {
 		printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
 		       "track %s, req %s\n",
@@ -469,13 +470,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 		return err;
 	}
 
-	if (where)
-		list_add(&new->nd, where);
-	else
-		list_add_tail(&new->nd, &memtype_list);
-
-	memtype_rb_insert(&memtype_rbroot, new);
-
 	spin_unlock(&memtype_lock);
 
 	dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
@@ -937,28 +931,40 @@ EXPORT_SYMBOL_GPL(pgprot_writecombine);
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
 
 /* get Nth element of the linked list */
-static struct memtype *memtype_get_idx(loff_t pos)
+static int copy_memtype_nth_element(struct memtype *out, loff_t pos)
 {
-	struct memtype *list_node, *print_entry;
+	struct memtype *list_node;
 	int i = 1;
 
-	print_entry  = kmalloc(sizeof(struct memtype), GFP_KERNEL);
-	if (!print_entry)
-		return NULL;
-
-	spin_lock(&memtype_lock);
 	list_for_each_entry(list_node, &memtype_list, nd) {
 		if (pos == i) {
-			*print_entry = *list_node;
-			spin_unlock(&memtype_lock);
-			return print_entry;
+			*out = *list_node;
+			return 0;
 		}
 		++i;
 	}
+	return 1;
+}
+
+static struct memtype *memtype_get_idx(loff_t pos)
+{
+	struct memtype *print_entry;
+	int ret;
+
+	print_entry  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+	if (!print_entry)
+		return NULL;
+
+	spin_lock(&memtype_lock);
+	ret = copy_memtype_nth_element(print_entry, pos);
 	spin_unlock(&memtype_lock);
-	kfree(print_entry);
 
-	return NULL;
+	if (!ret) {
+		return print_entry;
+	} else {
+		kfree(print_entry);
+		return NULL;
+	}
 }
 
 static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
new file mode 100644
index 000000000000..6c98780eb731
--- /dev/null
+++ b/arch/x86/mm/pat_internal.h
@@ -0,0 +1,28 @@
+#ifndef __PAT_INTERNAL_H_
+#define __PAT_INTERNAL_H_
+
+extern int pat_debug_enable;
+
+#define dprintk(fmt, arg...) \
+	do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
+
+struct memtype {
+	u64			start;
+	u64			end;
+	unsigned long		type;
+	struct list_head	nd;
+	struct rb_node		rb;
+};
+
+static inline char *cattr_name(unsigned long flags)
+{
+	switch (flags & _PAGE_CACHE_MASK) {
+	case _PAGE_CACHE_UC:		return "uncached";
+	case _PAGE_CACHE_UC_MINUS:	return "uncached-minus";
+	case _PAGE_CACHE_WB:		return "write-back";
+	case _PAGE_CACHE_WC:		return "write-combining";
+	default:			return "broken";
+	}
+}
+
+#endif /* __PAT_INTERNAL_H_ */
-- 
cgit v1.2.3-55-g7522


From 9e41a49aab88a5a6c8f4875bf10a5543bc321f2d Mon Sep 17 00:00:00 2001
From: Pallipadi, Venkatesh
Date: Wed, 10 Feb 2010 15:26:07 -0800
Subject: x86, pat: Migrate to rbtree only backend for pat memtype management

Move pat backend to fully rbtree based implementation from the existing
rbtree and linked list hybrid.

New rbtree based solution uses interval trees (augmented rbtrees) in
order to store the PAT ranges. The new code seprates out the pat backend
to pat_rbtree.c file, making is cleaner. The change also makes the PAT
lookup, reserve and free operations more optimal, as we don't have to
traverse linear linked list of few tens of entries in normal case.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <20100210232607.GB11465@linux-os.sc.intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/mm/Makefile       |   1 +
 arch/x86/mm/pat.c          | 209 +---------------------------------
 arch/x86/mm/pat_internal.h |  20 +++-
 arch/x86/mm/pat_rbtree.c   | 271 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 296 insertions(+), 205 deletions(-)
 create mode 100644 arch/x86/mm/pat_rbtree.c

(limited to 'arch/x86')

diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 06630d26e56d..a4c768397baa 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,7 @@ nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_physaddr.o		:= $(nostackp)
 CFLAGS_setup_nx.o		:= $(nostackp)
 
+obj-$(CONFIG_X86_PAT)		+= pat_rbtree.o
 obj-$(CONFIG_SMP)		+= tlb.o
 
 obj-$(CONFIG_X86_32)		+= pgtable_32.o iomap_32.o
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 628e507b7936..951011166ef5 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -130,65 +130,7 @@ void pat_init(void)
 
 #undef PAT
 
-/*
- * The global memtype list keeps track of memory type for specific
- * physical memory areas. Conflicting memory types in different
- * mappings can cause CPU cache corruption. To avoid this we keep track.
- *
- * The list is sorted based on starting address and can contain multiple
- * entries for each address (this allows reference counting for overlapping
- * areas). All the aliases have the same cache attributes of course.
- * Zero attributes are represented as holes.
- *
- * The data structure is a list that is also organized as an rbtree
- * sorted on the start address of memtype range.
- *
- * memtype_lock protects both the linear list and rbtree.
- */
-
-static struct rb_root memtype_rbroot = RB_ROOT;
-static LIST_HEAD(memtype_list);
-static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype list */
-
-static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
-{
-	struct rb_node *node = root->rb_node;
-	struct memtype *last_lower = NULL;
-
-	while (node) {
-		struct memtype *data = container_of(node, struct memtype, rb);
-
-		if (data->start < start) {
-			last_lower = data;
-			node = node->rb_right;
-		} else if (data->start > start) {
-			node = node->rb_left;
-		} else
-			return data;
-	}
-
-	/* Will return NULL if there is no entry with its start <= start */
-	return last_lower;
-}
-
-static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
-{
-	struct rb_node **new = &(root->rb_node);
-	struct rb_node *parent = NULL;
-
-	while (*new) {
-		struct memtype *this = container_of(*new, struct memtype, rb);
-
-		parent = *new;
-		if (data->start <= this->start)
-			new = &((*new)->rb_left);
-		else if (data->start > this->start)
-			new = &((*new)->rb_right);
-	}
-
-	rb_link_node(&data->rb, parent, new);
-	rb_insert_color(&data->rb, root);
-}
+static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */
 
 /*
  * Does intersection of PAT memory type and MTRR memory type and returns
@@ -216,33 +158,6 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
 	return req_type;
 }
 
-static int
-chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
-{
-	if (new->type != entry->type) {
-		if (type) {
-			new->type = entry->type;
-			*type = entry->type;
-		} else
-			goto conflict;
-	}
-
-	 /* check overlaps with more than one entry in the list */
-	list_for_each_entry_continue(entry, &memtype_list, nd) {
-		if (new->end <= entry->start)
-			break;
-		else if (new->type != entry->type)
-			goto conflict;
-	}
-	return 0;
-
- conflict:
-	printk(KERN_INFO "%s:%d conflicting memory types "
-	       "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
-	       new->end, cattr_name(new->type), cattr_name(entry->type));
-	return -EBUSY;
-}
-
 static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
 {
 	int ram_page = 0, not_rampage = 0;
@@ -328,64 +243,6 @@ static int free_ram_pages_type(u64 start, u64 end)
 	return 0;
 }
 
-static int memtype_check_insert(struct memtype *new, unsigned long *new_type)
-{
-	struct memtype *entry;
-	u64 start, end;
-	unsigned long actual_type;
-	struct list_head *where;
-	int err = 0;
-
-	start = new->start;
-	end = new->end;
-	actual_type = new->type;
-
-	/* Search for existing mapping that overlaps the current range */
-	where = NULL;
-	list_for_each_entry(entry, &memtype_list, nd) {
-		if (end <= entry->start) {
-			where = entry->nd.prev;
-			break;
-		} else if (start <= entry->start) { /* end > entry->start */
-			err = chk_conflict(new, entry, new_type);
-			if (!err) {
-				dprintk("Overlap at 0x%Lx-0x%Lx\n",
-					entry->start, entry->end);
-				where = entry->nd.prev;
-			}
-			break;
-		} else if (start < entry->end) { /* start > entry->start */
-			err = chk_conflict(new, entry, new_type);
-			if (!err) {
-				dprintk("Overlap at 0x%Lx-0x%Lx\n",
-					entry->start, entry->end);
-
-				/*
-				 * Move to right position in the linked
-				 * list to add this new entry
-				 */
-				list_for_each_entry_continue(entry,
-							&memtype_list, nd) {
-					if (start <= entry->start) {
-						where = entry->nd.prev;
-						break;
-					}
-				}
-			}
-			break;
-		}
-	}
-	if (!err) {
-		if (where)
-			list_add(&new->nd, where);
-		else
-			list_add_tail(&new->nd, &memtype_list);
-
-		memtype_rb_insert(&memtype_rbroot, new);
-	}
-	return err;
-}
-
 /*
  * req_type typically has one of the:
  * - _PAGE_CACHE_WB
@@ -459,7 +316,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 	spin_lock(&memtype_lock);
 
-	err = memtype_check_insert(new, new_type);
+	err = rbt_memtype_check_insert(new, new_type);
 	if (err) {
 		printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
 		       "track %s, req %s\n",
@@ -481,7 +338,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 int free_memtype(u64 start, u64 end)
 {
-	struct memtype *entry, *saved_entry;
 	int err = -EINVAL;
 	int is_range_ram;
 
@@ -505,46 +361,7 @@ int free_memtype(u64 start, u64 end)
 	}
 
 	spin_lock(&memtype_lock);
-
-	entry = memtype_rb_search(&memtype_rbroot, start);
-	if (unlikely(entry == NULL))
-		goto unlock_ret;
-
-	/*
-	 * Saved entry points to an entry with start same or less than what
-	 * we searched for. Now go through the list in both directions to look
-	 * for the entry that matches with both start and end, with list stored
-	 * in sorted start address
-	 */
-	saved_entry = entry;
-	list_for_each_entry_from(entry, &memtype_list, nd) {
-		if (entry->start == start && entry->end == end) {
-			rb_erase(&entry->rb, &memtype_rbroot);
-			list_del(&entry->nd);
-			kfree(entry);
-			err = 0;
-			break;
-		} else if (entry->start > start) {
-			break;
-		}
-	}
-
-	if (!err)
-		goto unlock_ret;
-
-	entry = saved_entry;
-	list_for_each_entry_reverse(entry, &memtype_list, nd) {
-		if (entry->start == start && entry->end == end) {
-			rb_erase(&entry->rb, &memtype_rbroot);
-			list_del(&entry->nd);
-			kfree(entry);
-			err = 0;
-			break;
-		} else if (entry->start < start) {
-			break;
-		}
-	}
-unlock_ret:
+	err = rbt_memtype_erase(start, end);
 	spin_unlock(&memtype_lock);
 
 	if (err) {
@@ -593,7 +410,7 @@ static unsigned long lookup_memtype(u64 paddr)
 
 	spin_lock(&memtype_lock);
 
-	entry = memtype_rb_search(&memtype_rbroot, paddr);
+	entry = rbt_memtype_lookup(paddr);
 	if (entry != NULL)
 		rettype = entry->type;
 	else
@@ -930,22 +747,6 @@ EXPORT_SYMBOL_GPL(pgprot_writecombine);
 
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
 
-/* get Nth element of the linked list */
-static int copy_memtype_nth_element(struct memtype *out, loff_t pos)
-{
-	struct memtype *list_node;
-	int i = 1;
-
-	list_for_each_entry(list_node, &memtype_list, nd) {
-		if (pos == i) {
-			*out = *list_node;
-			return 0;
-		}
-		++i;
-	}
-	return 1;
-}
-
 static struct memtype *memtype_get_idx(loff_t pos)
 {
 	struct memtype *print_entry;
@@ -956,7 +757,7 @@ static struct memtype *memtype_get_idx(loff_t pos)
 		return NULL;
 
 	spin_lock(&memtype_lock);
-	ret = copy_memtype_nth_element(print_entry, pos);
+	ret = rbt_memtype_copy_nth_element(print_entry, pos);
 	spin_unlock(&memtype_lock);
 
 	if (!ret) {
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index 6c98780eb731..4f39eefa3e61 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -9,8 +9,8 @@ extern int pat_debug_enable;
 struct memtype {
 	u64			start;
 	u64			end;
+	u64			subtree_max_end;
 	unsigned long		type;
-	struct list_head	nd;
 	struct rb_node		rb;
 };
 
@@ -25,4 +25,22 @@ static inline char *cattr_name(unsigned long flags)
 	}
 }
 
+#ifdef CONFIG_X86_PAT
+extern int rbt_memtype_check_insert(struct memtype *new,
+					unsigned long *new_type);
+extern int rbt_memtype_erase(u64 start, u64 end);
+extern struct memtype *rbt_memtype_lookup(u64 addr);
+extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
+#else
+static inline int rbt_memtype_check_insert(struct memtype *new,
+					unsigned long *new_type)
+{ return 0; }
+static inline int rbt_memtype_erase(u64 start, u64 end)
+{ return 0; }
+static inline struct memtype *rbt_memtype_lookup(u64 addr)
+{ return NULL; }
+static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
+{ return 0; }
+#endif
+
 #endif /* __PAT_INTERNAL_H_ */
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
new file mode 100644
index 000000000000..9063f40b638b
--- /dev/null
+++ b/arch/x86/mm/pat_rbtree.c
@@ -0,0 +1,271 @@
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *          Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Interval tree (augmented rbtree) used to store the PAT memory type
+ * reservations.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+
+#include <asm/pgtable.h>
+#include <asm/pat.h>
+
+#include "pat_internal.h"
+
+/*
+ * The memtype tree keeps track of memory type for specific
+ * physical memory areas. Without proper tracking, conflicting memory
+ * types in different mappings can cause CPU cache corruption.
+ *
+ * The tree is an interval tree (augmented rbtree) with tree ordered
+ * on starting address. Tree can contain multiple entries for
+ * different regions which overlap. All the aliases have the same
+ * cache attributes of course.
+ *
+ * memtype_lock protects the rbtree.
+ */
+
+static void memtype_rb_augment_cb(struct rb_node *node);
+static struct rb_root memtype_rbroot = RB_AUGMENT_ROOT(&memtype_rb_augment_cb);
+
+static int is_node_overlap(struct memtype *node, u64 start, u64 end)
+{
+	if (node->start >= end || node->end <= start)
+		return 0;
+
+	return 1;
+}
+
+static u64 get_subtree_max_end(struct rb_node *node)
+{
+	u64 ret = 0;
+	if (node) {
+		struct memtype *data = container_of(node, struct memtype, rb);
+		ret = data->subtree_max_end;
+	}
+	return ret;
+}
+
+/* Update 'subtree_max_end' for a node, based on node and its children */
+static void update_node_max_end(struct rb_node *node)
+{
+	struct memtype *data;
+	u64 max_end, child_max_end;
+
+	if (!node)
+		return;
+
+	data = container_of(node, struct memtype, rb);
+	max_end = data->end;
+
+	child_max_end = get_subtree_max_end(node->rb_right);
+	if (child_max_end > max_end)
+		max_end = child_max_end;
+
+	child_max_end = get_subtree_max_end(node->rb_left);
+	if (child_max_end > max_end)
+		max_end = child_max_end;
+
+	data->subtree_max_end = max_end;
+}
+
+/* Update 'subtree_max_end' for a node and all its ancestors */
+static void update_path_max_end(struct rb_node *node)
+{
+	u64 old_max_end, new_max_end;
+
+	while (node) {
+		struct memtype *data = container_of(node, struct memtype, rb);
+
+		old_max_end = data->subtree_max_end;
+		update_node_max_end(node);
+		new_max_end = data->subtree_max_end;
+
+		if (new_max_end == old_max_end)
+			break;
+
+		node = rb_parent(node);
+	}
+}
+
+/* Find the first (lowest start addr) overlapping range from rb tree */
+static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
+				u64 start, u64 end)
+{
+	struct rb_node *node = root->rb_node;
+	struct memtype *last_lower = NULL;
+
+	while (node) {
+		struct memtype *data = container_of(node, struct memtype, rb);
+
+		if (get_subtree_max_end(node->rb_left) > start) {
+			/* Lowest overlap if any must be on left side */
+			node = node->rb_left;
+		} else if (is_node_overlap(data, start, end)) {
+			last_lower = data;
+			break;
+		} else if (start >= data->start) {
+			/* Lowest overlap if any must be on right side */
+			node = node->rb_right;
+		} else {
+			break;
+		}
+	}
+	return last_lower; /* Returns NULL if there is no overlap */
+}
+
+static struct memtype *memtype_rb_exact_match(struct rb_root *root,
+				u64 start, u64 end)
+{
+	struct memtype *match;
+
+	match = memtype_rb_lowest_match(root, start, end);
+	while (match != NULL && match->start < end) {
+		struct rb_node *node;
+
+		if (match->start == start && match->end == end)
+			return match;
+
+		node = rb_next(&match->rb);
+		if (node)
+			match = container_of(node, struct memtype, rb);
+		else
+			match = NULL;
+	}
+
+	return NULL; /* Returns NULL if there is no exact match */
+}
+
+static int memtype_rb_check_conflict(struct rb_root *root,
+				u64 start, u64 end,
+				unsigned long reqtype, unsigned long *newtype)
+{
+	struct rb_node *node;
+	struct memtype *match;
+	int found_type = reqtype;
+
+	match = memtype_rb_lowest_match(&memtype_rbroot, start, end);
+	if (match == NULL)
+		goto success;
+
+	if (match->type != found_type && newtype == NULL)
+		goto failure;
+
+	dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
+	found_type = match->type;
+
+	node = rb_next(&match->rb);
+	while (node) {
+		match = container_of(node, struct memtype, rb);
+
+		if (match->start >= end) /* Checked all possible matches */
+			goto success;
+
+		if (is_node_overlap(match, start, end) &&
+		    match->type != found_type) {
+			goto failure;
+		}
+
+		node = rb_next(&match->rb);
+	}
+success:
+	if (newtype)
+		*newtype = found_type;
+
+	return 0;
+
+failure:
+	printk(KERN_INFO "%s:%d conflicting memory types "
+		"%Lx-%Lx %s<->%s\n", current->comm, current->pid, start,
+		end, cattr_name(found_type), cattr_name(match->type));
+	return -EBUSY;
+}
+
+static void memtype_rb_augment_cb(struct rb_node *node)
+{
+	if (node)
+		update_path_max_end(node);
+}
+
+static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
+{
+	struct rb_node **node = &(root->rb_node);
+	struct rb_node *parent = NULL;
+
+	while (*node) {
+		struct memtype *data = container_of(*node, struct memtype, rb);
+
+		parent = *node;
+		if (newdata->start <= data->start)
+			node = &((*node)->rb_left);
+		else if (newdata->start > data->start)
+			node = &((*node)->rb_right);
+	}
+
+	rb_link_node(&newdata->rb, parent, node);
+	rb_insert_color(&newdata->rb, root);
+}
+
+int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
+{
+	int err = 0;
+
+	err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end,
+						new->type, ret_type);
+
+	if (!err) {
+		new->type = *ret_type;
+		memtype_rb_insert(&memtype_rbroot, new);
+	}
+	return err;
+}
+
+int rbt_memtype_erase(u64 start, u64 end)
+{
+	struct memtype *data;
+
+	data = memtype_rb_exact_match(&memtype_rbroot, start, end);
+	if (!data)
+		return -EINVAL;
+
+	rb_erase(&data->rb, &memtype_rbroot);
+	return 0;
+}
+
+struct memtype *rbt_memtype_lookup(u64 addr)
+{
+	struct memtype *data;
+	data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
+	return data;
+}
+
+#if defined(CONFIG_DEBUG_FS)
+int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
+{
+	struct rb_node *node;
+	int i = 1;
+
+	node = rb_first(&memtype_rbroot);
+	while (node && pos != i) {
+		node = rb_next(node);
+		i++;
+	}
+
+	if (node) { /* pos == i */
+		struct memtype *this = container_of(node, struct memtype, rb);
+		*out = *this;
+		return 0;
+	} else {
+		return 1;
+	}
+}
+#endif
-- 
cgit v1.2.3-55-g7522


From b3ac891b67bd4b1fc728d1c784cad1212dea433d Mon Sep 17 00:00:00 2001
From: Luca Barbieri
Date: Wed, 24 Feb 2010 10:54:22 +0100
Subject: x86: Add support for lock prefix in alternatives

The current lock prefix UP/SMP alternative code doesn't allow
LOCK_PREFIX to be used in alternatives code.

This patch solves the problem by adding a new LOCK_PREFIX_ALTERNATIVE_PATCH
macro that only records the lock prefix location but does not emit
the prefix.

The user of this macro can then start any alternative sequence with
"lock" and have it UP/SMP patched.

To make this work, the UP/SMP alternative code is changed to do the
lock/DS prefix switching only if the byte actually contains a lock or
DS prefix.

Thus, if an alternative without the "lock" is selected, it will now do
nothing instead of clobbering the code.

Changes in v2:
- Naming change
- Change label to not conflict with alternatives

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267005265-27958-2-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/alternative.h | 8 +++++---
 arch/x86/kernel/alternative.c      | 6 ++++--
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 3b5b828767b6..55fee12cea6d 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -28,12 +28,14 @@
  */
 
 #ifdef CONFIG_SMP
-#define LOCK_PREFIX \
+#define LOCK_PREFIX_HERE \
 		".section .smp_locks,\"a\"\n"	\
 		_ASM_ALIGN "\n"			\
-		_ASM_PTR "661f\n" /* address */	\
+		_ASM_PTR "671f\n" /* address */	\
 		".previous\n"			\
-		"661:\n\tlock; "
+		"671:"
+
+#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
 
 #else /* ! CONFIG_SMP */
 #define LOCK_PREFIX ""
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2589ea4c60ce..80b222ea4cf6 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -244,7 +244,8 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 		if (*ptr > text_end)
 			continue;
 		/* turn DS segment override prefix into lock prefix */
-		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+		if (**ptr == 0x3e)
+			text_poke(*ptr, ((unsigned char []){0xf0}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
@@ -263,7 +264,8 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
 		if (*ptr > text_end)
 			continue;
 		/* turn lock prefix into DS segment override prefix */
-		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+		if (**ptr == 0xf0)
+			text_poke(*ptr, ((unsigned char []){0x3E}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
-- 
cgit v1.2.3-55-g7522


From 9c76b38476b18c45f97098a10b0176b321eba3ea Mon Sep 17 00:00:00 2001
From: Luca Barbieri
Date: Wed, 24 Feb 2010 10:54:23 +0100
Subject: x86-32: Allow UP/SMP lock replacement in cmpxchg64

Use the functionality just introduced in the previous patch: mark the
lock prefixes in cmpxchg64 alternatives for UP removal.

Changes in v2:
- Naming change

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267005265-27958-3-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/cmpxchg_32.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index ffb9bb6b6c37..8859e12dd3cf 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -271,7 +271,8 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
 	__typeof__(*(ptr)) __ret;				\
 	__typeof__(*(ptr)) __old = (o);				\
 	__typeof__(*(ptr)) __new = (n);				\
-	alternative_io("call cmpxchg8b_emu",			\
+	alternative_io(LOCK_PREFIX_HERE				\
+			"call cmpxchg8b_emu",			\
 			"lock; cmpxchg8b (%%esi)" ,		\
 		       X86_FEATURE_CX8,				\
 		       "=A" (__ret),				\
-- 
cgit v1.2.3-55-g7522


From a7e926abc3adfbd2e5e20d2b46177adb4e313915 Mon Sep 17 00:00:00 2001
From: Luca Barbieri
Date: Wed, 24 Feb 2010 10:54:25 +0100
Subject: x86-32: Rewrite 32-bit atomic64 functions in assembly

This patch replaces atomic64_32.c with two assembly implementations,
one for 386/486 machines using pushf/cli/popf and one for 586+ machines
using cmpxchg8b.

The cmpxchg8b implementation provides the following advantages over the
current one:

1. Implements atomic64_add_unless, atomic64_dec_if_positive and
   atomic64_inc_not_zero

2. Uses the ZF flag changed by cmpxchg8b instead of doing a comparison

3. Uses custom register calling conventions that reduce or eliminate
   register moves to suit cmpxchg8b

4. Reads the initial value instead of using cmpxchg8b to do that.
   Currently we use lock xaddl and movl, which seems the fastest.

5. Does not use the lock prefix for atomic64_set
   64-bit writes are already atomic, so we don't need that.
   We still need it for atomic64_read to avoid restoring a value
   changed in the meantime.

6. Allocates registers as well or better than gcc

The 386 implementation provides support for 386 and 486 machines.
386/486 SMP is not supported (we dropped it), but such support can be
added easily if desired.

A pure assembly implementation is required due to the custom calling
conventions, and desire to use %ebp in atomic64_add_return (we need
7 registers...), as well as the ability to use pushf/popf in the 386
code without an intermediate pop/push.

The parameter names are changed to match the convention in atomic_64.h

Changes in v3 (due to rebasing to tip/x86/asm):
- Patches atomic64_32.h instead of atomic_32.h
- Uses the CALL alternative mechanism from commit
  1b1d9258181bae199dc940f4bd0298126b9a73d9

Changes in v2:
- Merged 386 and cx8 support in the same patch
- 386 support now done in assembly, C code no longer used at all
- cmpxchg64 is used for atomic64_cmpxchg
- stop using macros, use one-line inline functions instead
- miscellanous changes and improvements

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267005265-27958-5-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/atomic64_32.h | 278 ++++++++++++++++++++++++++++---------
 arch/x86/lib/Makefile              |   3 +-
 arch/x86/lib/atomic64_32.c         | 273 +++++++-----------------------------
 arch/x86/lib/atomic64_386_32.S     | 175 +++++++++++++++++++++++
 arch/x86/lib/atomic64_cx8_32.S     | 225 ++++++++++++++++++++++++++++++
 5 files changed, 664 insertions(+), 290 deletions(-)
 create mode 100644 arch/x86/lib/atomic64_386_32.S
 create mode 100644 arch/x86/lib/atomic64_cx8_32.S

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 03027bf28de5..2a934aa19a43 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -14,109 +14,193 @@ typedef struct {
 
 #define ATOMIC64_INIT(val)	{ (val) }
 
-extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
+#ifdef CONFIG_X86_CMPXCHG64
+#define ATOMIC64_ALTERNATIVE_(f, g) "call atomic64_" #g "_cx8"
+#else
+#define ATOMIC64_ALTERNATIVE_(f, g) ALTERNATIVE("call atomic64_" #f "_386", "call atomic64_" #g "_cx8", X86_FEATURE_CX8)
+#endif
+
+#define ATOMIC64_ALTERNATIVE(f) ATOMIC64_ALTERNATIVE_(f, f)
+
+/**
+ * atomic64_cmpxchg - cmpxchg atomic64 variable
+ * @p: pointer to type atomic64_t
+ * @o: expected value
+ * @n: new value
+ *
+ * Atomically sets @v to @n if it was equal to @o and returns
+ * the old value.
+ */
+
+static inline long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n)
+{
+	return cmpxchg64(&v->counter, o, n);
+}
 
 /**
  * atomic64_xchg - xchg atomic64 variable
- * @ptr:      pointer to type atomic64_t
- * @new_val:  value to assign
+ * @v: pointer to type atomic64_t
+ * @n: value to assign
  *
- * Atomically xchgs the value of @ptr to @new_val and returns
+ * Atomically xchgs the value of @v to @n and returns
  * the old value.
  */
-extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
+static inline long long atomic64_xchg(atomic64_t *v, long long n)
+{
+	long long o;
+	unsigned high = (unsigned)(n >> 32);
+	unsigned low = (unsigned)n;
+	asm volatile(ATOMIC64_ALTERNATIVE(xchg)
+		     : "=A" (o), "+b" (low), "+c" (high)
+		     : "S" (v)
+		     : "memory"
+		     );
+	return o;
+}
 
 /**
  * atomic64_set - set atomic64 variable
- * @ptr:      pointer to type atomic64_t
- * @new_val:  value to assign
+ * @v: pointer to type atomic64_t
+ * @n: value to assign
  *
- * Atomically sets the value of @ptr to @new_val.
+ * Atomically sets the value of @v to @n.
  */
-extern void atomic64_set(atomic64_t *ptr, u64 new_val);
+static inline void atomic64_set(atomic64_t *v, long long i)
+{
+	unsigned high = (unsigned)(i >> 32);
+	unsigned low = (unsigned)i;
+	asm volatile(ATOMIC64_ALTERNATIVE(set)
+		     : "+b" (low), "+c" (high)
+		     : "S" (v)
+		     : "eax", "edx", "memory"
+		     );
+}
 
 /**
  * atomic64_read - read atomic64 variable
- * @ptr:      pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
  *
- * Atomically reads the value of @ptr and returns it.
+ * Atomically reads the value of @v and returns it.
  */
-static inline u64 atomic64_read(atomic64_t *ptr)
+static inline long long atomic64_read(atomic64_t *v)
 {
-	u64 res;
-
-	/*
-	 * Note, we inline this atomic64_t primitive because
-	 * it only clobbers EAX/EDX and leaves the others
-	 * untouched. We also (somewhat subtly) rely on the
-	 * fact that cmpxchg8b returns the current 64-bit value
-	 * of the memory location we are touching:
-	 */
-	asm volatile(
-		"mov %%ebx, %%eax\n\t"
-		"mov %%ecx, %%edx\n\t"
-		LOCK_PREFIX "cmpxchg8b %1\n"
-			: "=&A" (res)
-			: "m" (*ptr)
-		);
-
-	return res;
-}
-
-extern u64 atomic64_read(atomic64_t *ptr);
+	long long r;
+	asm volatile(ATOMIC64_ALTERNATIVE(read)
+		     : "=A" (r), "+c" (v)
+		     : : "memory"
+		     );
+	return r;
+ }
 
 /**
  * atomic64_add_return - add and return
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
  *
- * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ * Atomically adds @i to @v and returns @i + *@v
  */
-extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
+static inline long long atomic64_add_return(long long i, atomic64_t *v)
+{
+	asm volatile(ATOMIC64_ALTERNATIVE(add_return)
+		     : "+A" (i), "+c" (v)
+		     : : "memory"
+		     );
+	return i;
+}
 
 /*
  * Other variants with different arithmetic operators:
  */
-extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
-extern u64 atomic64_inc_return(atomic64_t *ptr);
-extern u64 atomic64_dec_return(atomic64_t *ptr);
+static inline long long atomic64_sub_return(long long i, atomic64_t *v)
+{
+	asm volatile(ATOMIC64_ALTERNATIVE(sub_return)
+		     : "+A" (i), "+c" (v)
+		     : : "memory"
+		     );
+	return i;
+}
+
+static inline long long atomic64_inc_return(atomic64_t *v)
+{
+	long long a;
+	asm volatile(ATOMIC64_ALTERNATIVE(inc_return)
+		     : "=A" (a)
+		     : "S" (v)
+		     : "memory", "ecx"
+		     );
+	return a;
+}
+
+static inline long long atomic64_dec_return(atomic64_t *v)
+{
+	long long a;
+	asm volatile(ATOMIC64_ALTERNATIVE(dec_return)
+		     : "=A" (a)
+		     : "S" (v)
+		     : "memory", "ecx"
+		     );
+	return a;
+}
 
 /**
  * atomic64_add - add integer to atomic64 variable
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
  *
- * Atomically adds @delta to @ptr.
+ * Atomically adds @i to @v.
  */
-extern void atomic64_add(u64 delta, atomic64_t *ptr);
+static inline long long atomic64_add(long long i, atomic64_t *v)
+{
+	asm volatile(ATOMIC64_ALTERNATIVE_(add, add_return)
+		     : "+A" (i), "+c" (v)
+		     : : "memory"
+		     );
+	return i;
+}
 
 /**
  * atomic64_sub - subtract the atomic64 variable
- * @delta: integer value to subtract
- * @ptr:   pointer to type atomic64_t
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
  *
- * Atomically subtracts @delta from @ptr.
+ * Atomically subtracts @i from @v.
  */
-extern void atomic64_sub(u64 delta, atomic64_t *ptr);
+static inline long long atomic64_sub(long long i, atomic64_t *v)
+{
+	asm volatile(ATOMIC64_ALTERNATIVE_(sub, sub_return)
+		     : "+A" (i), "+c" (v)
+		     : : "memory"
+		     );
+	return i;
+}
 
 /**
  * atomic64_sub_and_test - subtract value from variable and test result
- * @delta: integer value to subtract
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr and returns
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
+  *
+ * Atomically subtracts @i from @v and returns
  * true if the result is zero, or false for all
  * other cases.
  */
-extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
+static inline int atomic64_sub_and_test(long long i, atomic64_t *v)
+{
+	return atomic64_sub_return(i, v) == 0;
+}
 
 /**
  * atomic64_inc - increment atomic64 variable
- * @ptr: pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
  *
- * Atomically increments @ptr by 1.
+ * Atomically increments @v by 1.
  */
-extern void atomic64_inc(atomic64_t *ptr);
+static inline void atomic64_inc(atomic64_t *v)
+{
+	asm volatile(ATOMIC64_ALTERNATIVE_(inc, inc_return)
+		     : : "S" (v)
+		     : "memory", "eax", "ecx", "edx"
+		     );
+}
 
 /**
  * atomic64_dec - decrement atomic64 variable
@@ -124,37 +208,97 @@ extern void atomic64_inc(atomic64_t *ptr);
  *
  * Atomically decrements @ptr by 1.
  */
-extern void atomic64_dec(atomic64_t *ptr);
+static inline void atomic64_dec(atomic64_t *v)
+{
+	asm volatile(ATOMIC64_ALTERNATIVE_(dec, dec_return)
+		     : : "S" (v)
+		     : "memory", "eax", "ecx", "edx"
+		     );
+}
 
 /**
  * atomic64_dec_and_test - decrement and test
- * @ptr: pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
  *
- * Atomically decrements @ptr by 1 and
+ * Atomically decrements @v by 1 and
  * returns true if the result is 0, or false for all other
  * cases.
  */
-extern int atomic64_dec_and_test(atomic64_t *ptr);
+static inline int atomic64_dec_and_test(atomic64_t *v)
+{
+	return atomic64_dec_return(v) == 0;
+}
 
 /**
  * atomic64_inc_and_test - increment and test
- * @ptr: pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
  *
- * Atomically increments @ptr by 1
+ * Atomically increments @v by 1
  * and returns true if the result is zero, or false for all
  * other cases.
  */
-extern int atomic64_inc_and_test(atomic64_t *ptr);
+static inline int atomic64_inc_and_test(atomic64_t *v)
+{
+	return atomic64_inc_return(v) == 0;
+}
 
 /**
  * atomic64_add_negative - add and test if negative
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
  *
- * Atomically adds @delta to @ptr and returns true
+ * Atomically adds @i to @v and returns true
  * if the result is negative, or false when
  * result is greater than or equal to zero.
  */
-extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
+static inline int atomic64_add_negative(long long i, atomic64_t *v)
+{
+	return atomic64_add_return(i, v) < 0;
+}
+
+/**
+ * atomic64_add_unless - add unless the number is a given value
+ * @v: pointer of type atomic64_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as it was not @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
+{
+	unsigned low = (unsigned)u;
+	unsigned high = (unsigned)(u >> 32);
+	asm volatile(ATOMIC64_ALTERNATIVE(add_unless) "\n\t"
+		     : "+A" (a), "+c" (v), "+S" (low), "+D" (high)
+		     : : "memory");
+	return (int)a;
+}
+
+
+static inline int atomic64_inc_not_zero(atomic64_t *v)
+{
+	int r;
+	asm volatile(ATOMIC64_ALTERNATIVE(inc_not_zero)
+		     : "=a" (r)
+		     : "S" (v)
+		     : "ecx", "edx", "memory"
+		     );
+	return r;
+}
+
+static inline long long atomic64_dec_if_positive(atomic64_t *v)
+{
+	long long r;
+	asm volatile(ATOMIC64_ALTERNATIVE(dec_if_positive)
+		     : "=A" (r)
+		     : "S" (v)
+		     : "ecx", "memory"
+		     );
+	return r;
+}
+
+#undef ATOMIC64_ALTERNATIVE
+#undef ATOMIC64_ALTERNATIVE_
 
 #endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index cffd754f3039..05d686bbbe9f 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -26,11 +26,12 @@ obj-y += msr.o msr-reg.o msr-reg-export.o
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
+        lib-y += atomic64_cx8_32.o
         lib-y += checksum_32.o
         lib-y += strstr_32.o
         lib-y += semaphore_32.o string_32.o
 ifneq ($(CONFIG_X86_CMPXCHG64),y)
-        lib-y += cmpxchg8b_emu.o
+        lib-y += cmpxchg8b_emu.o atomic64_386_32.o
 endif
         lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
 else
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 824fa0be55a3..540179e8e9fa 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -6,225 +6,54 @@
 #include <asm/cmpxchg.h>
 #include <asm/atomic.h>
 
-static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new)
-{
-	u32 low = new;
-	u32 high = new >> 32;
-
-	asm volatile(
-		LOCK_PREFIX "cmpxchg8b %1\n"
-		     : "+A" (old), "+m" (*ptr)
-		     :  "b" (low),  "c" (high)
-		     );
-	return old;
-}
-
-u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val)
-{
-	return cmpxchg8b(&ptr->counter, old_val, new_val);
-}
-EXPORT_SYMBOL(atomic64_cmpxchg);
-
-/**
- * atomic64_xchg - xchg atomic64 variable
- * @ptr:      pointer to type atomic64_t
- * @new_val:  value to assign
- *
- * Atomically xchgs the value of @ptr to @new_val and returns
- * the old value.
- */
-u64 atomic64_xchg(atomic64_t *ptr, u64 new_val)
-{
-	/*
-	 * Try first with a (possibly incorrect) assumption about
-	 * what we have there. We'll do two loops most likely,
-	 * but we'll get an ownership MESI transaction straight away
-	 * instead of a read transaction followed by a
-	 * flush-for-ownership transaction:
-	 */
-	u64 old_val, real_val = 0;
-
-	do {
-		old_val = real_val;
-
-		real_val = atomic64_cmpxchg(ptr, old_val, new_val);
-
-	} while (real_val != old_val);
-
-	return old_val;
-}
-EXPORT_SYMBOL(atomic64_xchg);
-
-/**
- * atomic64_set - set atomic64 variable
- * @ptr:      pointer to type atomic64_t
- * @new_val:  value to assign
- *
- * Atomically sets the value of @ptr to @new_val.
- */
-void atomic64_set(atomic64_t *ptr, u64 new_val)
-{
-	atomic64_xchg(ptr, new_val);
-}
-EXPORT_SYMBOL(atomic64_set);
-
-/**
-EXPORT_SYMBOL(atomic64_read);
- * atomic64_add_return - add and return
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns @delta + *@ptr
- */
-noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
-{
-	/*
-	 * Try first with a (possibly incorrect) assumption about
-	 * what we have there. We'll do two loops most likely,
-	 * but we'll get an ownership MESI transaction straight away
-	 * instead of a read transaction followed by a
-	 * flush-for-ownership transaction:
-	 */
-	u64 old_val, new_val, real_val = 0;
-
-	do {
-		old_val = real_val;
-		new_val = old_val + delta;
-
-		real_val = atomic64_cmpxchg(ptr, old_val, new_val);
-
-	} while (real_val != old_val);
-
-	return new_val;
-}
-EXPORT_SYMBOL(atomic64_add_return);
-
-u64 atomic64_sub_return(u64 delta, atomic64_t *ptr)
-{
-	return atomic64_add_return(-delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_sub_return);
-
-u64 atomic64_inc_return(atomic64_t *ptr)
-{
-	return atomic64_add_return(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc_return);
-
-u64 atomic64_dec_return(atomic64_t *ptr)
-{
-	return atomic64_sub_return(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec_return);
-
-/**
- * atomic64_add - add integer to atomic64 variable
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr.
- */
-void atomic64_add(u64 delta, atomic64_t *ptr)
-{
-	atomic64_add_return(delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_add);
-
-/**
- * atomic64_sub - subtract the atomic64 variable
- * @delta: integer value to subtract
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr.
- */
-void atomic64_sub(u64 delta, atomic64_t *ptr)
-{
-	atomic64_add(-delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_sub);
-
-/**
- * atomic64_sub_and_test - subtract value from variable and test result
- * @delta: integer value to subtract
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-int atomic64_sub_and_test(u64 delta, atomic64_t *ptr)
-{
-	u64 new_val = atomic64_sub_return(delta, ptr);
-
-	return new_val == 0;
-}
-EXPORT_SYMBOL(atomic64_sub_and_test);
-
-/**
- * atomic64_inc - increment atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1.
- */
-void atomic64_inc(atomic64_t *ptr)
-{
-	atomic64_add(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc);
-
-/**
- * atomic64_dec - decrement atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1.
- */
-void atomic64_dec(atomic64_t *ptr)
-{
-	atomic64_sub(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec);
-
-/**
- * atomic64_dec_and_test - decrement and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-int atomic64_dec_and_test(atomic64_t *ptr)
-{
-	return atomic64_sub_and_test(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec_and_test);
-
-/**
- * atomic64_inc_and_test - increment and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-int atomic64_inc_and_test(atomic64_t *ptr)
-{
-	return atomic64_sub_and_test(-1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc_and_test);
-
-/**
- * atomic64_add_negative - add and test if negative
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-int atomic64_add_negative(u64 delta, atomic64_t *ptr)
-{
-	s64 new_val = atomic64_add_return(delta, ptr);
-
-	return new_val < 0;
-}
-EXPORT_SYMBOL(atomic64_add_negative);
+long long atomic64_read_cx8(long long, const atomic64_t *v);
+EXPORT_SYMBOL(atomic64_read_cx8);
+long long atomic64_set_cx8(long long, const atomic64_t *v);
+EXPORT_SYMBOL(atomic64_set_cx8);
+long long atomic64_xchg_cx8(long long, unsigned high);
+EXPORT_SYMBOL(atomic64_xchg_cx8);
+long long atomic64_add_return_cx8(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_add_return_cx8);
+long long atomic64_sub_return_cx8(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_sub_return_cx8);
+long long atomic64_inc_return_cx8(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_return_cx8);
+long long atomic64_dec_return_cx8(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_return_cx8);
+long long atomic64_dec_if_positive_cx8(atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_if_positive_cx8);
+int atomic64_inc_not_zero_cx8(atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_not_zero_cx8);
+int atomic64_add_unless_cx8(atomic64_t *v, long long a, long long u);
+EXPORT_SYMBOL(atomic64_add_unless_cx8);
+
+#ifndef CONFIG_X86_CMPXCHG64
+long long atomic64_read_386(long long, const atomic64_t *v);
+EXPORT_SYMBOL(atomic64_read_386);
+long long atomic64_set_386(long long, const atomic64_t *v);
+EXPORT_SYMBOL(atomic64_set_386);
+long long atomic64_xchg_386(long long, unsigned high);
+EXPORT_SYMBOL(atomic64_xchg_386);
+long long atomic64_add_return_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_add_return_386);
+long long atomic64_sub_return_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_sub_return_386);
+long long atomic64_inc_return_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_return_386);
+long long atomic64_dec_return_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_return_386);
+long long atomic64_add_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_add_386);
+long long atomic64_sub_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_sub_386);
+long long atomic64_inc_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_386);
+long long atomic64_dec_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_386);
+long long atomic64_dec_if_positive_386(atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_if_positive_386);
+int atomic64_inc_not_zero_386(atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_not_zero_386);
+int atomic64_add_unless_386(atomic64_t *v, long long a, long long u);
+EXPORT_SYMBOL(atomic64_add_unless_386);
+#endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
new file mode 100644
index 000000000000..5db07fe4a0ca
--- /dev/null
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -0,0 +1,175 @@
+/*
+ * atomic64_t for 386/486
+ *
+ * Copyright © 2010  Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+/* if you want SMP support, implement these with real spinlocks */
+.macro LOCK reg
+	pushfl
+	CFI_ADJUST_CFA_OFFSET 4
+	cli
+.endm
+
+.macro UNLOCK reg
+	popfl
+	CFI_ADJUST_CFA_OFFSET -4
+.endm
+
+.macro BEGIN func reg
+$v = \reg
+
+ENTRY(atomic64_\func\()_386)
+	CFI_STARTPROC
+	LOCK $v
+
+.macro RETURN
+	UNLOCK $v
+	ret
+.endm
+
+.macro END_
+	CFI_ENDPROC
+ENDPROC(atomic64_\func\()_386)
+.purgem RETURN
+.purgem END_
+.purgem END
+.endm
+
+.macro END
+RETURN
+END_
+.endm
+.endm
+
+BEGIN read %ecx
+	movl  ($v), %eax
+	movl 4($v), %edx
+END
+
+BEGIN set %esi
+	movl %ebx,  ($v)
+	movl %ecx, 4($v)
+END
+
+BEGIN xchg %esi
+	movl  ($v), %eax
+	movl 4($v), %edx
+	movl %ebx,  ($v)
+	movl %ecx, 4($v)
+END
+
+BEGIN add %ecx
+	addl %eax,  ($v)
+	adcl %edx, 4($v)
+END
+
+BEGIN add_return %ecx
+	addl  ($v), %eax
+	adcl 4($v), %edx
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+END
+
+BEGIN sub %ecx
+	subl %eax,  ($v)
+	sbbl %edx, 4($v)
+END
+
+BEGIN sub_return %ecx
+	negl %edx
+	negl %eax
+	sbbl $0, %edx
+	addl  ($v), %eax
+	adcl 4($v), %edx
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+END
+
+BEGIN inc %esi
+	addl $1,  ($v)
+	adcl $0, 4($v)
+END
+
+BEGIN inc_return %esi
+	movl  ($v), %eax
+	movl 4($v), %edx
+	addl $1, %eax
+	adcl $0, %edx
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+END
+
+BEGIN dec %esi
+	subl $1,  ($v)
+	sbbl $0, 4($v)
+END
+
+BEGIN dec_return %esi
+	movl  ($v), %eax
+	movl 4($v), %edx
+	subl $1, %eax
+	sbbl $0, %edx
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+END
+
+BEGIN add_unless %ecx
+	addl %eax, %esi
+	adcl %edx, %edi
+	addl  ($v), %eax
+	adcl 4($v), %edx
+	cmpl %eax, %esi
+	je 3f
+1:
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+	xorl %eax, %eax
+2:
+RETURN
+3:
+	cmpl %edx, %edi
+	jne 1b
+	movl $1, %eax
+	jmp 2b
+END_
+
+BEGIN inc_not_zero %esi
+	movl  ($v), %eax
+	movl 4($v), %edx
+	testl %eax, %eax
+	je 3f
+1:
+	addl $1, %eax
+	adcl $0, %edx
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+	xorl %eax, %eax
+2:
+RETURN
+3:
+	testl %edx, %edx
+	jne 1b
+	movl $1, %eax
+	jmp 2b
+END_
+
+BEGIN dec_if_positive %esi
+	movl  ($v), %eax
+	movl 4($v), %edx
+	subl $1, %eax
+	sbbl $0, %edx
+	js 1f
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+1:
+END
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
new file mode 100644
index 000000000000..e49c4ebca9f4
--- /dev/null
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -0,0 +1,225 @@
+/*
+ * atomic64_t for 586+
+ *
+ * Copyright © 2010  Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+.macro SAVE reg
+	pushl %\reg
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET \reg, 0
+.endm
+
+.macro RESTORE reg
+	popl %\reg
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE \reg
+.endm
+
+.macro read64 reg
+	movl %ebx, %eax
+	movl %ecx, %edx
+/* we need LOCK_PREFIX since otherwise cmpxchg8b always does the write */
+	LOCK_PREFIX
+	cmpxchg8b (\reg)
+.endm
+
+ENTRY(atomic64_read_cx8)
+	CFI_STARTPROC
+
+	read64 %ecx
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_read_cx8)
+
+ENTRY(atomic64_set_cx8)
+	CFI_STARTPROC
+
+1:
+/* we don't need LOCK_PREFIX since aligned 64-bit writes
+ * are atomic on 586 and newer */
+	cmpxchg8b (%esi)
+	jne 1b
+
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_set_cx8)
+
+ENTRY(atomic64_xchg_cx8)
+	CFI_STARTPROC
+
+	movl %ebx, %eax
+	movl %ecx, %edx
+1:
+	LOCK_PREFIX
+	cmpxchg8b (%esi)
+	jne 1b
+
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_xchg_cx8)
+
+.macro addsub_return func ins insc
+ENTRY(atomic64_\func\()_return_cx8)
+	CFI_STARTPROC
+	SAVE ebp
+	SAVE ebx
+	SAVE esi
+	SAVE edi
+
+	movl %eax, %esi
+	movl %edx, %edi
+	movl %ecx, %ebp
+
+	read64 %ebp
+1:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	\ins\()l %esi, %ebx
+	\insc\()l %edi, %ecx
+	LOCK_PREFIX
+	cmpxchg8b (%ebp)
+	jne 1b
+
+10:
+	movl %ebx, %eax
+	movl %ecx, %edx
+	RESTORE edi
+	RESTORE esi
+	RESTORE ebx
+	RESTORE ebp
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_\func\()_return_cx8)
+.endm
+
+addsub_return add add adc
+addsub_return sub sub sbb
+
+.macro incdec_return func ins insc
+ENTRY(atomic64_\func\()_return_cx8)
+	CFI_STARTPROC
+	SAVE ebx
+
+	read64 %esi
+1:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	\ins\()l $1, %ebx
+	\insc\()l $0, %ecx
+	LOCK_PREFIX
+	cmpxchg8b (%esi)
+	jne 1b
+
+10:
+	movl %ebx, %eax
+	movl %ecx, %edx
+	RESTORE ebx
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_\func\()_return_cx8)
+.endm
+
+incdec_return inc add adc
+incdec_return dec sub sbb
+
+ENTRY(atomic64_dec_if_positive_cx8)
+	CFI_STARTPROC
+	SAVE ebx
+
+	read64 %esi
+1:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	subl $1, %ebx
+	sbb $0, %ecx
+	js 2f
+	LOCK_PREFIX
+	cmpxchg8b (%esi)
+	jne 1b
+
+2:
+	movl %ebx, %eax
+	movl %ecx, %edx
+	RESTORE ebx
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_dec_if_positive_cx8)
+
+ENTRY(atomic64_add_unless_cx8)
+	CFI_STARTPROC
+	SAVE ebp
+	SAVE ebx
+/* these just push these two parameters on the stack */
+	SAVE edi
+	SAVE esi
+
+	movl %ecx, %ebp
+	movl %eax, %esi
+	movl %edx, %edi
+
+	read64 %ebp
+1:
+	cmpl %eax, 0(%esp)
+	je 4f
+2:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	addl %esi, %ebx
+	adcl %edi, %ecx
+	LOCK_PREFIX
+	cmpxchg8b (%ebp)
+	jne 1b
+
+	xorl %eax, %eax
+3:
+	addl $8, %esp
+	CFI_ADJUST_CFA_OFFSET -8
+	RESTORE ebx
+	RESTORE ebp
+	ret
+4:
+	cmpl %edx, 4(%esp)
+	jne 2b
+	movl $1, %eax
+	jmp 3b
+	CFI_ENDPROC
+ENDPROC(atomic64_add_unless_cx8)
+
+ENTRY(atomic64_inc_not_zero_cx8)
+	CFI_STARTPROC
+	SAVE ebx
+
+	read64 %esi
+1:
+	testl %eax, %eax
+	je 4f
+2:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	addl $1, %ebx
+	adcl $0, %ecx
+	LOCK_PREFIX
+	cmpxchg8b (%esi)
+	jne 1b
+
+	xorl %eax, %eax
+3:
+	RESTORE ebx
+	ret
+4:
+	testl %edx, %edx
+	jne 2b
+	movl $1, %eax
+	jmp 3b
+	CFI_ENDPROC
+ENDPROC(atomic64_inc_not_zero_cx8)
-- 
cgit v1.2.3-55-g7522


From d7f6de1e9c4a12e11ba7186c70f0f40caa76f590 Mon Sep 17 00:00:00 2001
From: Luca Barbieri
Date: Fri, 26 Feb 2010 12:22:41 +0100
Subject: x86: Implement atomic[64]_dec_if_positive()

Add support for atomic_dec_if_positive(), and
atomic64_dec_if_positive() for x86-64.

atomic64_dec_if_positive() for x86-32 was already implemented in a previous patch.

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267183361-20775-2-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/atomic.h      | 23 +++++++++++++++++++++++
 arch/x86/include/asm/atomic64_64.h | 23 +++++++++++++++++++++++
 lib/atomic64_test.c                |  2 +-
 3 files changed, 47 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 8f8217b9bdac..706c69492c14 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -246,6 +246,29 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 
 #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
 
+/*
+ * atomic_dec_if_positive - decrement by 1 if old value positive
+ * @v: pointer of type atomic_t
+ *
+ * The function returns the old value of *v minus 1, even if
+ * the atomic variable, v, was not decremented.
+ */
+static inline int atomic_dec_if_positive(atomic_t *v)
+{
+	int c, old, dec;
+	c = atomic_read(v);
+	for (;;) {
+		dec = c - 1;
+		if (unlikely(dec < 0))
+			break;
+		old = atomic_cmpxchg((v), c, dec);
+		if (likely(old == c))
+			break;
+		c = old;
+	}
+	return dec;
+}
+
 /**
  * atomic_inc_short - increment of a short integer
  * @v: pointer to type int
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 51c5b4056929..4d6e2cd6c88c 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -221,4 +221,27 @@ static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
 
 #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
 
+/*
+ * atomic64_dec_if_positive - decrement by 1 if old value positive
+ * @v: pointer of type atomic_t
+ *
+ * The function returns the old value of *v minus 1, even if
+ * the atomic variable, v, was not decremented.
+ */
+static inline long atomic64_dec_if_positive(atomic64_t *v)
+{
+	long c, old, dec;
+	c = atomic64_read(v);
+	for (;;) {
+		dec = c - 1;
+		if (unlikely(dec < 0))
+			break;
+		old = atomic64_cmpxchg((v), c, dec);
+		if (likely(old == c))
+			break;
+		c = old;
+	}
+	return dec;
+}
+
 #endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index 0effcacbebda..58efdabb3845 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -112,7 +112,7 @@ static __init int test_atomic64(void)
 	r += one;
 	BUG_ON(v.counter != r);
 
-#if defined(CONFIG_X86_32) || defined(CONFIG_MIPS) || defined(CONFIG_PPC) || defined(_ASM_GENERIC_ATOMIC64_H)
+#if defined(CONFIG_X86) || defined(CONFIG_MIPS) || defined(CONFIG_PPC) || defined(_ASM_GENERIC_ATOMIC64_H)
 	INIT(onestwos);
 	BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1));
 	r -= one;
-- 
cgit v1.2.3-55-g7522


From 6e6104fe085026e6ef82cc5cc303d6c8ceb7e411 Mon Sep 17 00:00:00 2001
From: Luca Barbieri
Date: Mon, 1 Mar 2010 19:55:46 +0100
Subject: x86-32: Fix atomic64_add_unless return value convention

atomic64_add_unless must return 1 if it perfomed the add and 0 otherwise.
The implementation did the opposite thing.

Reported-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267469749-11878-3-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/lib/atomic64_386_32.S | 4 ++--
 arch/x86/lib/atomic64_cx8_32.S | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 5db07fe4a0ca..a2f847c88b89 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -133,13 +133,13 @@ BEGIN add_unless %ecx
 1:
 	movl %eax,  ($v)
 	movl %edx, 4($v)
-	xorl %eax, %eax
+	movl $1, %eax
 2:
 RETURN
 3:
 	cmpl %edx, %edi
 	jne 1b
-	movl $1, %eax
+	xorl %eax, %eax
 	jmp 2b
 END_
 
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index e49c4ebca9f4..d0e37b189f82 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -180,7 +180,7 @@ ENTRY(atomic64_add_unless_cx8)
 	cmpxchg8b (%ebp)
 	jne 1b
 
-	xorl %eax, %eax
+	movl $1, %eax
 3:
 	addl $8, %esp
 	CFI_ADJUST_CFA_OFFSET -8
@@ -190,7 +190,7 @@ ENTRY(atomic64_add_unless_cx8)
 4:
 	cmpl %edx, 4(%esp)
 	jne 2b
-	movl $1, %eax
+	xorl %eax, %eax
 	jmp 3b
 	CFI_ENDPROC
 ENDPROC(atomic64_add_unless_cx8)
-- 
cgit v1.2.3-55-g7522


From f3e83131469e29032a700217aa394996107b8fc5 Mon Sep 17 00:00:00 2001
From: Luca Barbieri
Date: Mon, 1 Mar 2010 19:55:49 +0100
Subject: x86-32: Fix atomic64_inc_not_zero return value convention

atomic64_inc_not_zero must return 1 if it perfomed the add and 0 otherwise.
It was doing the opposite thing.

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267469749-11878-6-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/lib/atomic64_386_32.S | 3 +--
 arch/x86/lib/atomic64_cx8_32.S | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index a2f847c88b89..4a5979aa6883 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -153,13 +153,12 @@ BEGIN inc_not_zero %esi
 	adcl $0, %edx
 	movl %eax,  ($v)
 	movl %edx, 4($v)
-	xorl %eax, %eax
+	movl $1, %eax
 2:
 RETURN
 3:
 	testl %edx, %edx
 	jne 1b
-	movl $1, %eax
 	jmp 2b
 END_
 
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index d0e37b189f82..71e080de3352 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -212,14 +212,13 @@ ENTRY(atomic64_inc_not_zero_cx8)
 	cmpxchg8b (%esi)
 	jne 1b
 
-	xorl %eax, %eax
+	movl $1, %eax
 3:
 	RESTORE ebx
 	ret
 4:
 	testl %edx, %edx
 	jne 2b
-	movl $1, %eax
 	jmp 3b
 	CFI_ENDPROC
 ENDPROC(atomic64_inc_not_zero_cx8)
-- 
cgit v1.2.3-55-g7522


From 4daa2a8093ecd1148270a1fc64e99f072b8c2901 Mon Sep 17 00:00:00 2001
From: Pallipadi, Venkatesh
Date: Wed, 24 Feb 2010 13:43:55 -0800
Subject: x86, pat: In rbt_memtype_check_insert(), update new->type only if
 valid

new->type should only change when there is a valid ret_type. Otherwise
the requested type and return type should be same.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <20100224214355.GA16431@linux-os.sc.intel.com>
Tested-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/mm/pat_rbtree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 9063f40b638b..07de4cb8cc30 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -223,7 +223,9 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
 						new->type, ret_type);
 
 	if (!err) {
-		new->type = *ret_type;
+		if (ret_type)
+			new->type = *ret_type;
+
 		memtype_rb_insert(&memtype_rbroot, new);
 	}
 	return err;
-- 
cgit v1.2.3-55-g7522


From ced918eb748ce30b3aace549fd17540e40ffdca0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 17 Feb 2010 16:47:10 +0000
Subject: i8253: Convert i8253_lock to raw_spinlock

i8253_lock needs to be a real spinlock in preempt-rt, i.e. it can
not be converted to a sleeping lock.

Convert it to raw_spinlock and fix up all users.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Acked-by: Takashi Iwai <tiwai@suse.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <20100217163751.030764372@linutronix.de>

---
 arch/mips/include/asm/i8253.h     |  2 +-
 arch/mips/kernel/i8253.c          | 14 +++++++-------
 arch/x86/include/asm/i8253.h      |  2 +-
 arch/x86/kernel/apm_32.c          |  4 ++--
 arch/x86/kernel/i8253.c           | 14 +++++++-------
 drivers/block/hd.c                |  4 ++--
 drivers/input/gameport/gameport.c |  4 ++--
 drivers/input/joystick/analog.c   |  4 ++--
 drivers/input/misc/pcspkr.c       |  6 +++---
 sound/drivers/pcsp/pcsp.h         |  2 +-
 sound/drivers/pcsp/pcsp_input.c   |  4 ++--
 sound/drivers/pcsp/pcsp_lib.c     | 12 ++++++------
 12 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/mips/include/asm/i8253.h b/arch/mips/include/asm/i8253.h
index 032ca73f181b..48bb82372994 100644
--- a/arch/mips/include/asm/i8253.h
+++ b/arch/mips/include/asm/i8253.h
@@ -12,7 +12,7 @@
 #define PIT_CH0			0x40
 #define PIT_CH2			0x42
 
-extern spinlock_t i8253_lock;
+extern raw_spinlock_t i8253_lock;
 
 extern void setup_pit_timer(void);
 
diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c
index ed5c441615e4..94794062a177 100644
--- a/arch/mips/kernel/i8253.c
+++ b/arch/mips/kernel/i8253.c
@@ -15,7 +15,7 @@
 #include <asm/io.h>
 #include <asm/time.h>
 
-DEFINE_SPINLOCK(i8253_lock);
+DEFINE_RAW_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 
 /*
@@ -26,7 +26,7 @@ EXPORT_SYMBOL(i8253_lock);
 static void init_pit_timer(enum clock_event_mode mode,
 			   struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 
 	switch(mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
@@ -55,7 +55,7 @@ static void init_pit_timer(enum clock_event_mode mode,
 		/* Nothing to do here */
 		break;
 	}
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 }
 
 /*
@@ -65,10 +65,10 @@ static void init_pit_timer(enum clock_event_mode mode,
  */
 static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 	outb_p(delta & 0xff , PIT_CH0);	/* LSB */
 	outb(delta >> 8 , PIT_CH0);	/* MSB */
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 
 	return 0;
 }
@@ -137,7 +137,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	static int old_count;
 	static u32 old_jifs;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/*
 	 * Although our caller may have the read side of xtime_lock,
 	 * this is now a seqlock, and we are cheating in this routine
@@ -183,7 +183,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	old_count = count;
 	old_jifs = jifs;
 
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
 	count = (LATCH - 1) - count;
 
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
index 1edbf89680fd..fc1f579fb965 100644
--- a/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@ -6,7 +6,7 @@
 #define PIT_CH0			0x40
 #define PIT_CH2			0x42
 
-extern spinlock_t i8253_lock;
+extern raw_spinlock_t i8253_lock;
 
 extern struct clock_event_device *global_clock_event;
 
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 031aa887b0eb..c4f9182ca3ac 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1224,7 +1224,7 @@ static void reinit_timer(void)
 #ifdef INIT_TIMER_AFTER_SUSPEND
 	unsigned long flags;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/* set the clock to HZ */
 	outb_pit(0x34, PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
 	udelay(10);
@@ -1232,7 +1232,7 @@ static void reinit_timer(void)
 	udelay(10);
 	outb_pit(LATCH >> 8, PIT_CH0);	/* MSB */
 	udelay(10);
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 #endif
 }
 
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 23c167925a5c..2dfd31597443 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -16,7 +16,7 @@
 #include <asm/hpet.h>
 #include <asm/smp.h>
 
-DEFINE_SPINLOCK(i8253_lock);
+DEFINE_RAW_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 
 /*
@@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event;
 static void init_pit_timer(enum clock_event_mode mode,
 			   struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 
 	switch (mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
@@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode,
 		/* Nothing to do here */
 		break;
 	}
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 }
 
 /*
@@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode,
  */
 static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 	outb_pit(delta & 0xff , PIT_CH0);	/* LSB */
 	outb_pit(delta >> 8 , PIT_CH0);		/* MSB */
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 
 	return 0;
 }
@@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	int count;
 	u32 jifs;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/*
 	 * Although our caller may have the read side of xtime_lock,
 	 * this is now a seqlock, and we are cheating in this routine
@@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	old_count = count;
 	old_jifs = jifs;
 
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
 	count = (LATCH - 1) - count;
 
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 5116c65c07cb..b9868ad0278d 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -165,12 +165,12 @@ unsigned long read_timer(void)
 	unsigned long t, flags;
 	int i;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	t = jiffies * 11932;
 	outb_p(0, 0x43);
 	i = inb_p(0x40);
 	i |= inb(0x40) << 8;
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 	return(t - i);
 }
 #endif
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index 7e18bcf05a66..46239e47a260 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -59,11 +59,11 @@ static unsigned int get_time_pit(void)
 	unsigned long flags;
 	unsigned int count;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	outb_p(0x00, 0x43);
 	count = inb_p(0x40);
 	count |= inb_p(0x40) << 8;
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
 	return count;
 }
diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c
index 1c0b529c06aa..4afe0a3b4884 100644
--- a/drivers/input/joystick/analog.c
+++ b/drivers/input/joystick/analog.c
@@ -146,11 +146,11 @@ static unsigned int get_time_pit(void)
         unsigned long flags;
         unsigned int count;
 
-        spin_lock_irqsave(&i8253_lock, flags);
+        raw_spin_lock_irqsave(&i8253_lock, flags);
         outb_p(0x00, 0x43);
         count = inb_p(0x40);
         count |= inb_p(0x40) << 8;
-        spin_unlock_irqrestore(&i8253_lock, flags);
+        raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
         return count;
 }
diff --git a/drivers/input/misc/pcspkr.c b/drivers/input/misc/pcspkr.c
index ea4e1fd12651..f080dd31499b 100644
--- a/drivers/input/misc/pcspkr.c
+++ b/drivers/input/misc/pcspkr.c
@@ -30,7 +30,7 @@ MODULE_ALIAS("platform:pcspkr");
 #include <asm/i8253.h>
 #else
 #include <asm/8253pit.h>
-static DEFINE_SPINLOCK(i8253_lock);
+static DEFINE_RAW_SPINLOCK(i8253_lock);
 #endif
 
 static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int code, int value)
@@ -50,7 +50,7 @@ static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int c
 	if (value > 20 && value < 32767)
 		count = PIT_TICK_RATE / value;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 
 	if (count) {
 		/* set command for counter 2, 2 byte write */
@@ -65,7 +65,7 @@ static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int c
 		outb(inb_p(0x61) & 0xFC, 0x61);
 	}
 
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
 	return 0;
 }
diff --git a/sound/drivers/pcsp/pcsp.h b/sound/drivers/pcsp/pcsp.h
index 1e123077923d..4ff6c8cc5077 100644
--- a/sound/drivers/pcsp/pcsp.h
+++ b/sound/drivers/pcsp/pcsp.h
@@ -16,7 +16,7 @@
 #include <asm/i8253.h>
 #else
 #include <asm/8253pit.h>
-static DEFINE_SPINLOCK(i8253_lock);
+static DEFINE_RAW_SPINLOCK(i8253_lock);
 #endif
 
 #define PCSP_SOUND_VERSION 0x400	/* read 4.00 */
diff --git a/sound/drivers/pcsp/pcsp_input.c b/sound/drivers/pcsp/pcsp_input.c
index 0444cdeb4bec..b5e2b54c2604 100644
--- a/sound/drivers/pcsp/pcsp_input.c
+++ b/sound/drivers/pcsp/pcsp_input.c
@@ -21,7 +21,7 @@ static void pcspkr_do_sound(unsigned int count)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 
 	if (count) {
 		/* set command for counter 2, 2 byte write */
@@ -36,7 +36,7 @@ static void pcspkr_do_sound(unsigned int count)
 		outb(inb_p(0x61) & 0xFC, 0x61);
 	}
 
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 }
 
 void pcspkr_stop_sound(void)
diff --git a/sound/drivers/pcsp/pcsp_lib.c b/sound/drivers/pcsp/pcsp_lib.c
index e1145ac6e908..f6a2e72b8cde 100644
--- a/sound/drivers/pcsp/pcsp_lib.c
+++ b/sound/drivers/pcsp/pcsp_lib.c
@@ -65,7 +65,7 @@ static u64 pcsp_timer_update(struct snd_pcsp *chip)
 	timer_cnt = val * CUR_DIV() / 256;
 
 	if (timer_cnt && chip->enable) {
-		spin_lock_irqsave(&i8253_lock, flags);
+		raw_spin_lock_irqsave(&i8253_lock, flags);
 		if (!nforce_wa) {
 			outb_p(chip->val61, 0x61);
 			outb_p(timer_cnt, 0x42);
@@ -74,7 +74,7 @@ static u64 pcsp_timer_update(struct snd_pcsp *chip)
 			outb(chip->val61 ^ 2, 0x61);
 			chip->thalf = 1;
 		}
-		spin_unlock_irqrestore(&i8253_lock, flags);
+		raw_spin_unlock_irqrestore(&i8253_lock, flags);
 	}
 
 	chip->ns_rem = PCSP_PERIOD_NS();
@@ -158,10 +158,10 @@ static int pcsp_start_playing(struct snd_pcsp *chip)
 		return -EIO;
 	}
 
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 	chip->val61 = inb(0x61) | 0x03;
 	outb_p(0x92, 0x43);	/* binary, mode 1, LSB only, ch 2 */
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 	atomic_set(&chip->timer_active, 1);
 	chip->thalf = 0;
 
@@ -178,11 +178,11 @@ static void pcsp_stop_playing(struct snd_pcsp *chip)
 		return;
 
 	atomic_set(&chip->timer_active, 0);
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 	/* restore the timer */
 	outb_p(0xb6, 0x43);	/* binary, mode 3, LSB/MSB, ch 2 */
 	outb(chip->val61 & 0xFC, 0x61);
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 }
 
 /*
-- 
cgit v1.2.3-55-g7522


From 12387a46bb150f5608de4aa9a90dfdddbf991e3f Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Wed, 10 Mar 2010 18:28:55 +0800
Subject: crypto: aesni-intel - Add AES-NI accelerated CTR mode

To take advantage of the hardware pipeline implementation of AES-NI
instructions. CTR mode cryption is implemented in ASM to schedule
multiple AES-NI instructions one after another. This way, some latency
of AES-NI instruction can be eliminated.

Performance testing based on dm-crypt should 50% reduction of
ecryption/decryption time.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_asm.S  | 115 ++++++++++++++++++++++++++++++++
 arch/x86/crypto/aesni-intel_glue.c | 130 +++++++++++++++++++++++++++++++++++--
 2 files changed, 238 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 20bb0e1ac681..822846a9eba9 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,6 +32,9 @@
 #define IN	IN1
 #define KEY	%xmm2
 #define IV	%xmm3
+#define BSWAP_MASK %xmm10
+#define CTR	%xmm11
+#define INC	%xmm12
 
 #define KEYP	%rdi
 #define OUTP	%rsi
@@ -42,6 +45,7 @@
 #define T1	%r10
 #define TKEYP	T1
 #define T2	%r11
+#define TCTR_LOW T2
 
 _key_expansion_128:
 _key_expansion_256a:
@@ -724,3 +728,114 @@ ENTRY(aesni_cbc_dec)
 	movups IV, (IVP)
 .Lcbc_dec_just_ret:
 	ret
+
+.align 16
+.Lbswap_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/*
+ * _aesni_inc_init:	internal ABI
+ *	setup registers used by _aesni_inc
+ * input:
+ *	IV
+ * output:
+ *	CTR:	== IV, in little endian
+ *	TCTR_LOW: == lower qword of CTR
+ *	INC:	== 1, in little endian
+ *	BSWAP_MASK == endian swapping mask
+ */
+_aesni_inc_init:
+	movaps .Lbswap_mask, BSWAP_MASK
+	movaps IV, CTR
+	PSHUFB_XMM BSWAP_MASK CTR
+	mov $1, TCTR_LOW
+	movq TCTR_LOW, INC
+	movq CTR, TCTR_LOW
+	ret
+
+/*
+ * _aesni_inc:		internal ABI
+ *	Increase IV by 1, IV is in big endian
+ * input:
+ *	IV
+ *	CTR:	== IV, in little endian
+ *	TCTR_LOW: == lower qword of CTR
+ *	INC:	== 1, in little endian
+ *	BSWAP_MASK == endian swapping mask
+ * output:
+ *	IV:	Increase by 1
+ * changed:
+ *	CTR:	== output IV, in little endian
+ *	TCTR_LOW: == lower qword of CTR
+ */
+_aesni_inc:
+	paddq INC, CTR
+	add $1, TCTR_LOW
+	jnc .Linc_low
+	pslldq $8, INC
+	paddq INC, CTR
+	psrldq $8, INC
+.Linc_low:
+	movaps CTR, IV
+	PSHUFB_XMM BSWAP_MASK IV
+	ret
+
+/*
+ * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ *		      size_t len, u8 *iv)
+ */
+ENTRY(aesni_ctr_enc)
+	cmp $16, LEN
+	jb .Lctr_enc_just_ret
+	mov 480(KEYP), KLEN
+	movups (IVP), IV
+	call _aesni_inc_init
+	cmp $64, LEN
+	jb .Lctr_enc_loop1
+.align 4
+.Lctr_enc_loop4:
+	movaps IV, STATE1
+	call _aesni_inc
+	movups (INP), IN1
+	movaps IV, STATE2
+	call _aesni_inc
+	movups 0x10(INP), IN2
+	movaps IV, STATE3
+	call _aesni_inc
+	movups 0x20(INP), IN3
+	movaps IV, STATE4
+	call _aesni_inc
+	movups 0x30(INP), IN4
+	call _aesni_enc4
+	pxor IN1, STATE1
+	movups STATE1, (OUTP)
+	pxor IN2, STATE2
+	movups STATE2, 0x10(OUTP)
+	pxor IN3, STATE3
+	movups STATE3, 0x20(OUTP)
+	pxor IN4, STATE4
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lctr_enc_loop4
+	cmp $16, LEN
+	jb .Lctr_enc_ret
+.align 4
+.Lctr_enc_loop1:
+	movaps IV, STATE
+	call _aesni_inc
+	movups (INP), IN
+	call _aesni_enc1
+	pxor IN, STATE
+	movups STATE, (OUTP)
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lctr_enc_loop1
+.Lctr_enc_ret:
+	movups IV, (IVP)
+.Lctr_enc_just_ret:
+	ret
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 49c552c060e9..2cb3dcc4490a 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -18,6 +18,7 @@
 #include <crypto/algapi.h>
 #include <crypto/aes.h>
 #include <crypto/cryptd.h>
+#include <crypto/ctr.h>
 #include <asm/i387.h>
 #include <asm/aes.h>
 
@@ -58,6 +59,8 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len, u8 *iv);
 
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
 {
@@ -321,6 +324,72 @@ static struct crypto_alg blk_cbc_alg = {
 	},
 };
 
+static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
+			    struct blkcipher_walk *walk)
+{
+	u8 *ctrblk = walk->iv;
+	u8 keystream[AES_BLOCK_SIZE];
+	u8 *src = walk->src.virt.addr;
+	u8 *dst = walk->dst.virt.addr;
+	unsigned int nbytes = walk->nbytes;
+
+	aesni_enc(ctx, keystream, ctrblk);
+	crypto_xor(keystream, src, nbytes);
+	memcpy(dst, keystream, nbytes);
+	crypto_inc(ctrblk, AES_BLOCK_SIZE);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc,
+		     struct scatterlist *dst, struct scatterlist *src,
+		     unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	kernel_fpu_begin();
+	while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
+		aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes & AES_BLOCK_MASK, walk.iv);
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	if (walk.nbytes) {
+		ctr_crypt_final(ctx, &walk);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_fpu_end();
+
+	return err;
+}
+
+static struct crypto_alg blk_ctr_alg = {
+	.cra_name		= "__ctr-aes-aesni",
+	.cra_driver_name	= "__driver-ctr-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_ctr_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+};
+
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
 {
@@ -467,13 +536,11 @@ static struct crypto_alg ablk_cbc_alg = {
 	},
 };
 
-#ifdef HAS_CTR
 static int ablk_ctr_init(struct crypto_tfm *tfm)
 {
 	struct cryptd_ablkcipher *cryptd_tfm;
 
-	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))",
-					     0, 0);
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-aes-aesni", 0, 0);
 	if (IS_ERR(cryptd_tfm))
 		return PTR_ERR(cryptd_tfm);
 	ablk_init_common(tfm, cryptd_tfm);
@@ -500,11 +567,50 @@ static struct crypto_alg ablk_ctr_alg = {
 			.ivsize		= AES_BLOCK_SIZE,
 			.setkey		= ablk_set_key,
 			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
+			.decrypt	= ablk_encrypt,
 			.geniv		= "chainiv",
 		},
 	},
 };
+
+#ifdef HAS_CTR
+static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher(
+		"rfc3686(__driver-ctr-aes-aesni)", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_rfc3686_ctr_alg = {
+	.cra_name		= "rfc3686(ctr(aes))",
+	.cra_driver_name	= "rfc3686-ctr-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_rfc3686_ctr_alg.cra_list),
+	.cra_init		= ablk_rfc3686_ctr_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = AES_MIN_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
+			.max_keysize = AES_MAX_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
+			.ivsize	     = CTR_RFC3686_IV_SIZE,
+			.setkey	     = ablk_set_key,
+			.encrypt     = ablk_encrypt,
+			.decrypt     = ablk_decrypt,
+			.geniv	     = "seqiv",
+		},
+	},
+};
 #endif
 
 #ifdef HAS_LRW
@@ -640,13 +746,17 @@ static int __init aesni_init(void)
 		goto blk_ecb_err;
 	if ((err = crypto_register_alg(&blk_cbc_alg)))
 		goto blk_cbc_err;
+	if ((err = crypto_register_alg(&blk_ctr_alg)))
+		goto blk_ctr_err;
 	if ((err = crypto_register_alg(&ablk_ecb_alg)))
 		goto ablk_ecb_err;
 	if ((err = crypto_register_alg(&ablk_cbc_alg)))
 		goto ablk_cbc_err;
-#ifdef HAS_CTR
 	if ((err = crypto_register_alg(&ablk_ctr_alg)))
 		goto ablk_ctr_err;
+#ifdef HAS_CTR
+	if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
+		goto ablk_rfc3686_ctr_err;
 #endif
 #ifdef HAS_LRW
 	if ((err = crypto_register_alg(&ablk_lrw_alg)))
@@ -675,13 +785,17 @@ ablk_pcbc_err:
 ablk_lrw_err:
 #endif
 #ifdef HAS_CTR
+	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
+ablk_rfc3686_ctr_err:
+#endif
 	crypto_unregister_alg(&ablk_ctr_alg);
 ablk_ctr_err:
-#endif
 	crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
 	crypto_unregister_alg(&ablk_ecb_alg);
 ablk_ecb_err:
+	crypto_unregister_alg(&blk_ctr_alg);
+blk_ctr_err:
 	crypto_unregister_alg(&blk_cbc_alg);
 blk_cbc_err:
 	crypto_unregister_alg(&blk_ecb_alg);
@@ -705,10 +819,12 @@ static void __exit aesni_exit(void)
 	crypto_unregister_alg(&ablk_lrw_alg);
 #endif
 #ifdef HAS_CTR
-	crypto_unregister_alg(&ablk_ctr_alg);
+	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
 #endif
+	crypto_unregister_alg(&ablk_ctr_alg);
 	crypto_unregister_alg(&ablk_cbc_alg);
 	crypto_unregister_alg(&ablk_ecb_alg);
+	crypto_unregister_alg(&blk_ctr_alg);
 	crypto_unregister_alg(&blk_cbc_alg);
 	crypto_unregister_alg(&blk_ecb_alg);
 	crypto_unregister_alg(&__aesni_alg);
-- 
cgit v1.2.3-55-g7522


From ca037701a025334e724e5c61b3b1082940c8b981 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 2 Mar 2010 19:52:12 +0100
Subject: perf, x86: Add PEBS infrastructure

This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.

This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.

With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.

This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).

It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c          | 223 +++++-------
 arch/x86/kernel/cpu/perf_event_intel.c    | 150 ++------
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 557 ++++++++++++++++++++++++++++++
 include/linux/perf_event.h                |   3 +-
 4 files changed, 671 insertions(+), 262 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_ds.c

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1d665a0b202c..0c03d5c1671f 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,45 +31,6 @@
 
 static u64 perf_event_mask __read_mostly;
 
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS	4
-
-/* The size of a BTS record in bytes: */
-#define BTS_RECORD_SIZE		24
-
-/* The size of a per-cpu BTS buffer in bytes: */
-#define BTS_BUFFER_SIZE		(BTS_RECORD_SIZE * 2048)
-
-/* The BTS overflow threshold in bytes from the end of the buffer: */
-#define BTS_OVFL_TH		(BTS_RECORD_SIZE * 128)
-
-
-/*
- * Bits in the debugctlmsr controlling branch tracing.
- */
-#define X86_DEBUGCTL_TR			(1 << 6)
-#define X86_DEBUGCTL_BTS		(1 << 7)
-#define X86_DEBUGCTL_BTINT		(1 << 8)
-#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)
-#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)
-
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-	u64	bts_buffer_base;
-	u64	bts_index;
-	u64	bts_absolute_maximum;
-	u64	bts_interrupt_threshold;
-	u64	pebs_buffer_base;
-	u64	pebs_index;
-	u64	pebs_absolute_maximum;
-	u64	pebs_interrupt_threshold;
-	u64	pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
 struct event_constraint {
 	union {
 		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
@@ -88,17 +49,29 @@ struct amd_nb {
 };
 
 struct cpu_hw_events {
+	/*
+	 * Generic x86 PMC bits
+	 */
 	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
 	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
 	int			enabled;
-	struct debug_store	*ds;
 
 	int			n_events;
 	int			n_added;
 	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
 	u64			tags[X86_PMC_IDX_MAX];
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+
+	/*
+	 * Intel DebugStore bits
+	 */
+	struct debug_store	*ds;
+	u64			pebs_enabled;
+
+	/*
+	 * AMD specific bits
+	 */
 	struct amd_nb		*amd_nb;
 };
 
@@ -112,12 +85,24 @@ struct cpu_hw_events {
 #define EVENT_CONSTRAINT(c, n, m)	\
 	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
 
+/*
+ * Constraint on the Event code.
+ */
 #define INTEL_EVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
 
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ */
 #define FIXED_EVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
 
+/*
+ * Constraint on the Event code + UMask
+ */
+#define PEBS_EVENT_CONSTRAINT(c, n)	\
+	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+
 #define EVENT_CONSTRAINT_END		\
 	EVENT_CONSTRAINT(0, 0, 0)
 
@@ -128,6 +113,9 @@ struct cpu_hw_events {
  * struct x86_pmu - generic x86 pmu
  */
 struct x86_pmu {
+	/*
+	 * Generic x86 PMC bits
+	 */
 	const char	*name;
 	int		version;
 	int		(*handle_irq)(struct pt_regs *);
@@ -146,10 +134,6 @@ struct x86_pmu {
 	u64		event_mask;
 	int		apic;
 	u64		max_period;
-	u64		intel_ctrl;
-	void		(*enable_bts)(u64 config);
-	void		(*disable_bts)(void);
-
 	struct event_constraint *
 			(*get_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
@@ -162,6 +146,19 @@ struct x86_pmu {
 	void		(*cpu_starting)(int cpu);
 	void		(*cpu_dying)(int cpu);
 	void		(*cpu_dead)(int cpu);
+
+	/*
+	 * Intel Arch Perfmon v2+
+	 */
+	u64		intel_ctrl;
+
+	/*
+	 * Intel DebugStore bits
+	 */
+	int		bts, pebs;
+	int		pebs_record_size;
+	void		(*drain_pebs)(struct pt_regs *regs);
+	struct event_constraint *pebs_constraints;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -293,110 +290,14 @@ static void release_pmc_hardware(void)
 #endif
 }
 
-static inline bool bts_available(void)
-{
-	return x86_pmu.enable_bts != NULL;
-}
-
-static void init_debug_store_on_cpu(int cpu)
-{
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-	if (!ds)
-		return;
-
-	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
-		     (u32)((u64)(unsigned long)ds),
-		     (u32)((u64)(unsigned long)ds >> 32));
-}
-
-static void fini_debug_store_on_cpu(int cpu)
-{
-	if (!per_cpu(cpu_hw_events, cpu).ds)
-		return;
-
-	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
-}
-
-static void release_bts_hardware(void)
-{
-	int cpu;
-
-	if (!bts_available())
-		return;
-
-	get_online_cpus();
-
-	for_each_online_cpu(cpu)
-		fini_debug_store_on_cpu(cpu);
-
-	for_each_possible_cpu(cpu) {
-		struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-		if (!ds)
-			continue;
-
-		per_cpu(cpu_hw_events, cpu).ds = NULL;
-
-		kfree((void *)(unsigned long)ds->bts_buffer_base);
-		kfree(ds);
-	}
-
-	put_online_cpus();
-}
-
-static int reserve_bts_hardware(void)
-{
-	int cpu, err = 0;
-
-	if (!bts_available())
-		return 0;
-
-	get_online_cpus();
-
-	for_each_possible_cpu(cpu) {
-		struct debug_store *ds;
-		void *buffer;
-
-		err = -ENOMEM;
-		buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
-		if (unlikely(!buffer))
-			break;
-
-		ds = kzalloc(sizeof(*ds), GFP_KERNEL);
-		if (unlikely(!ds)) {
-			kfree(buffer);
-			break;
-		}
-
-		ds->bts_buffer_base = (u64)(unsigned long)buffer;
-		ds->bts_index = ds->bts_buffer_base;
-		ds->bts_absolute_maximum =
-			ds->bts_buffer_base + BTS_BUFFER_SIZE;
-		ds->bts_interrupt_threshold =
-			ds->bts_absolute_maximum - BTS_OVFL_TH;
-
-		per_cpu(cpu_hw_events, cpu).ds = ds;
-		err = 0;
-	}
-
-	if (err)
-		release_bts_hardware();
-	else {
-		for_each_online_cpu(cpu)
-			init_debug_store_on_cpu(cpu);
-	}
-
-	put_online_cpus();
-
-	return err;
-}
+static int reserve_ds_buffers(void);
+static void release_ds_buffers(void);
 
 static void hw_perf_event_destroy(struct perf_event *event)
 {
 	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
 		release_pmc_hardware();
-		release_bts_hardware();
+		release_ds_buffers();
 		mutex_unlock(&pmc_reserve_mutex);
 	}
 }
@@ -459,7 +360,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 			if (!reserve_pmc_hardware())
 				err = -EBUSY;
 			else
-				err = reserve_bts_hardware();
+				err = reserve_ds_buffers();
 		}
 		if (!err)
 			atomic_inc(&active_events);
@@ -537,7 +438,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
 	    (hwc->sample_period == 1)) {
 		/* BTS is not supported by this architecture. */
-		if (!bts_available())
+		if (!x86_pmu.bts)
 			return -EOPNOTSUPP;
 
 		/* BTS is currently only allowed for user-mode. */
@@ -995,6 +896,7 @@ static void x86_pmu_unthrottle(struct perf_event *event)
 void perf_event_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+	u64 pebs;
 	struct cpu_hw_events *cpuc;
 	unsigned long flags;
 	int cpu, idx;
@@ -1012,12 +914,14 @@ void perf_event_print_debug(void)
 		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
 
 		pr_info("\n");
 		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
 		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
 		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
+		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
 	}
 	pr_info("CPU#%d: active:       %016llx\n", cpu, *(u64 *)cpuc->active_mask);
 
@@ -1333,6 +1237,7 @@ undo:
 
 #include "perf_event_amd.c"
 #include "perf_event_p6.c"
+#include "perf_event_intel_ds.c"
 #include "perf_event_intel.c"
 
 static int __cpuinit
@@ -1464,6 +1369,32 @@ static const struct pmu pmu = {
 	.unthrottle	= x86_pmu_unthrottle,
 };
 
+/*
+ * validate that we can schedule this event
+ */
+static int validate_event(struct perf_event *event)
+{
+	struct cpu_hw_events *fake_cpuc;
+	struct event_constraint *c;
+	int ret = 0;
+
+	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
+	if (!fake_cpuc)
+		return -ENOMEM;
+
+	c = x86_pmu.get_event_constraints(fake_cpuc, event);
+
+	if (!c || !c->weight)
+		ret = -ENOSPC;
+
+	if (x86_pmu.put_event_constraints)
+		x86_pmu.put_event_constraints(fake_cpuc, event);
+
+	kfree(fake_cpuc);
+
+	return ret;
+}
+
 /*
  * validate a single event group
  *
@@ -1529,6 +1460,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 
 		if (event->group_leader != event)
 			err = validate_group(event);
+		else
+			err = validate_event(event);
 
 		event->pmu = tmp;
 	}
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 84bfde64a337..11446412e4c7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -470,42 +470,6 @@ static u64 intel_pmu_raw_event(u64 hw_event)
 	return hw_event & CORE_EVNTSEL_MASK;
 }
 
-static void intel_pmu_enable_bts(u64 config)
-{
-	unsigned long debugctlmsr;
-
-	debugctlmsr = get_debugctlmsr();
-
-	debugctlmsr |= X86_DEBUGCTL_TR;
-	debugctlmsr |= X86_DEBUGCTL_BTS;
-	debugctlmsr |= X86_DEBUGCTL_BTINT;
-
-	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
-
-	if (!(config & ARCH_PERFMON_EVENTSEL_USR))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
-
-	update_debugctlmsr(debugctlmsr);
-}
-
-static void intel_pmu_disable_bts(void)
-{
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	unsigned long debugctlmsr;
-
-	if (!cpuc->ds)
-		return;
-
-	debugctlmsr = get_debugctlmsr();
-
-	debugctlmsr &=
-		~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
-		  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
-
-	update_debugctlmsr(debugctlmsr);
-}
-
 static void intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -514,6 +478,8 @@ static void intel_pmu_disable_all(void)
 
 	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
 		intel_pmu_disable_bts();
+
+	intel_pmu_pebs_disable_all();
 }
 
 static void intel_pmu_enable_all(void)
@@ -531,6 +497,8 @@ static void intel_pmu_enable_all(void)
 
 		intel_pmu_enable_bts(event->hw.config);
 	}
+
+	intel_pmu_pebs_enable_all();
 }
 
 static inline u64 intel_pmu_get_status(void)
@@ -547,8 +515,7 @@ static inline void intel_pmu_ack_status(u64 ack)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static inline void
-intel_pmu_disable_fixed(struct hw_perf_event *hwc)
+static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 {
 	int idx = hwc->idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, mask;
@@ -560,68 +527,7 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 	(void)checking_wrmsrl(hwc->config_base, ctrl_val);
 }
 
-static void intel_pmu_drain_bts_buffer(void)
-{
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	struct debug_store *ds = cpuc->ds;
-	struct bts_record {
-		u64	from;
-		u64	to;
-		u64	flags;
-	};
-	struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
-	struct bts_record *at, *top;
-	struct perf_output_handle handle;
-	struct perf_event_header header;
-	struct perf_sample_data data;
-	struct pt_regs regs;
-
-	if (!event)
-		return;
-
-	if (!ds)
-		return;
-
-	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
-	top = (struct bts_record *)(unsigned long)ds->bts_index;
-
-	if (top <= at)
-		return;
-
-	ds->bts_index = ds->bts_buffer_base;
-
-	perf_sample_data_init(&data, 0);
-
-	data.period	= event->hw.last_period;
-	regs.ip		= 0;
-
-	/*
-	 * Prepare a generic sample, i.e. fill in the invariant fields.
-	 * We will overwrite the from and to address before we output
-	 * the sample.
-	 */
-	perf_prepare_sample(&header, &data, event, &regs);
-
-	if (perf_output_begin(&handle, event,
-			      header.size * (top - at), 1, 1))
-		return;
-
-	for (; at < top; at++) {
-		data.ip		= at->from;
-		data.addr	= at->to;
-
-		perf_output_sample(&handle, &header, &data, event);
-	}
-
-	perf_output_end(&handle);
-
-	/* There's new data available. */
-	event->hw.interrupts++;
-	event->pending_kill = POLL_IN;
-}
-
-static inline void
-intel_pmu_disable_event(struct perf_event *event)
+static void intel_pmu_disable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
@@ -637,10 +543,12 @@ intel_pmu_disable_event(struct perf_event *event)
 	}
 
 	x86_pmu_disable_event(event);
+
+	if (unlikely(event->attr.precise))
+		intel_pmu_pebs_disable(hwc);
 }
 
-static inline void
-intel_pmu_enable_fixed(struct hw_perf_event *hwc)
+static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
 {
 	int idx = hwc->idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
@@ -689,6 +597,9 @@ static void intel_pmu_enable_event(struct perf_event *event)
 		return;
 	}
 
+	if (unlikely(event->attr.precise))
+		intel_pmu_pebs_enable(hwc);
+
 	__x86_pmu_enable_event(hwc);
 }
 
@@ -762,6 +673,13 @@ again:
 
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
+
+	/*
+	 * PEBS overflow sets bit 62 in the global status register
+	 */
+	if (__test_and_clear_bit(62, (unsigned long *)&status))
+		x86_pmu.drain_pebs(regs);
+
 	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
 		struct perf_event *event = cpuc->events[bit];
 
@@ -791,22 +709,18 @@ done:
 	return 1;
 }
 
-static struct event_constraint bts_constraint =
-	EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
-
 static struct event_constraint *
-intel_special_constraints(struct perf_event *event)
+intel_bts_constraints(struct perf_event *event)
 {
-	unsigned int hw_event;
-
-	hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned int hw_event, bts_event;
 
-	if (unlikely((hw_event ==
-		      x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
-		     (event->hw.sample_period == 1))) {
+	hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
+	bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
 
+	if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
 		return &bts_constraint;
-	}
+
 	return NULL;
 }
 
@@ -815,7 +729,11 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 {
 	struct event_constraint *c;
 
-	c = intel_special_constraints(event);
+	c = intel_bts_constraints(event);
+	if (c)
+		return c;
+
+	c = intel_pebs_constraints(event);
 	if (c)
 		return c;
 
@@ -864,8 +782,6 @@ static __initconst struct x86_pmu intel_pmu = {
 	 * the generic event period:
 	 */
 	.max_period		= (1ULL << 31) - 1,
-	.enable_bts		= intel_pmu_enable_bts,
-	.disable_bts		= intel_pmu_disable_bts,
 	.get_event_constraints	= intel_get_event_constraints,
 
 	.cpu_starting		= init_debug_store_on_cpu,
@@ -915,6 +831,8 @@ static __init int intel_pmu_init(void)
 	if (version > 1)
 		x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
 
+	intel_ds_init();
+
 	/*
 	 * Install the hw-cache-events table:
 	 */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
new file mode 100644
index 000000000000..0d994ef213b9
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -0,0 +1,557 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS		4
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE		24
+
+#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE	PAGE_SIZE
+
+/*
+ * pebs_record_32 for p4 and core not supported
+
+struct pebs_record_32 {
+	u32 flags, ip;
+	u32 ax, bc, cx, dx;
+	u32 si, di, bp, sp;
+};
+
+ */
+
+struct pebs_record_core {
+	u64 flags, ip;
+	u64 ax, bx, cx, dx;
+	u64 si, di, bp, sp;
+	u64 r8,  r9,  r10, r11;
+	u64 r12, r13, r14, r15;
+};
+
+struct pebs_record_nhm {
+	u64 flags, ip;
+	u64 ax, bx, cx, dx;
+	u64 si, di, bp, sp;
+	u64 r8,  r9,  r10, r11;
+	u64 r12, r13, r14, r15;
+	u64 status, dla, dse, lat;
+};
+
+/*
+ * Bits in the debugctlmsr controlling branch tracing.
+ */
+#define X86_DEBUGCTL_TR			(1 << 6)
+#define X86_DEBUGCTL_BTS		(1 << 7)
+#define X86_DEBUGCTL_BTINT		(1 << 8)
+#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)
+#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+	u64	bts_buffer_base;
+	u64	bts_index;
+	u64	bts_absolute_maximum;
+	u64	bts_interrupt_threshold;
+	u64	pebs_buffer_base;
+	u64	pebs_index;
+	u64	pebs_absolute_maximum;
+	u64	pebs_interrupt_threshold;
+	u64	pebs_event_reset[MAX_PEBS_EVENTS];
+};
+
+static void init_debug_store_on_cpu(int cpu)
+{
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+	if (!ds)
+		return;
+
+	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+		     (u32)((u64)(unsigned long)ds),
+		     (u32)((u64)(unsigned long)ds >> 32));
+}
+
+static void fini_debug_store_on_cpu(int cpu)
+{
+	if (!per_cpu(cpu_hw_events, cpu).ds)
+		return;
+
+	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static void release_ds_buffers(void)
+{
+	int cpu;
+
+	if (!x86_pmu.bts && !x86_pmu.pebs)
+		return;
+
+	get_online_cpus();
+
+	for_each_online_cpu(cpu)
+		fini_debug_store_on_cpu(cpu);
+
+	for_each_possible_cpu(cpu) {
+		struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+		if (!ds)
+			continue;
+
+		per_cpu(cpu_hw_events, cpu).ds = NULL;
+
+		kfree((void *)(unsigned long)ds->pebs_buffer_base);
+		kfree((void *)(unsigned long)ds->bts_buffer_base);
+		kfree(ds);
+	}
+
+	put_online_cpus();
+}
+
+static int reserve_ds_buffers(void)
+{
+	int cpu, err = 0;
+
+	if (!x86_pmu.bts && !x86_pmu.pebs)
+		return 0;
+
+	get_online_cpus();
+
+	for_each_possible_cpu(cpu) {
+		struct debug_store *ds;
+		void *buffer;
+		int max, thresh;
+
+		err = -ENOMEM;
+		ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+		if (unlikely(!ds)) {
+			kfree(buffer);
+			break;
+		}
+		per_cpu(cpu_hw_events, cpu).ds = ds;
+
+		if (x86_pmu.bts) {
+			buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+			if (unlikely(!buffer))
+				break;
+
+			max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+			thresh = max / 16;
+
+			ds->bts_buffer_base = (u64)(unsigned long)buffer;
+			ds->bts_index = ds->bts_buffer_base;
+			ds->bts_absolute_maximum = ds->bts_buffer_base +
+				max * BTS_RECORD_SIZE;
+			ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+				thresh * BTS_RECORD_SIZE;
+		}
+
+		if (x86_pmu.pebs) {
+			buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
+			if (unlikely(!buffer))
+				break;
+
+			max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
+
+			ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+			ds->pebs_index = ds->pebs_buffer_base;
+			ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+				max * x86_pmu.pebs_record_size;
+			/*
+			 * Always use single record PEBS
+			 */
+			ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
+				x86_pmu.pebs_record_size;
+		}
+
+		err = 0;
+	}
+
+	if (err)
+		release_ds_buffers();
+	else {
+		for_each_online_cpu(cpu)
+			init_debug_store_on_cpu(cpu);
+	}
+
+	put_online_cpus();
+
+	return err;
+}
+
+/*
+ * BTS
+ */
+
+static struct event_constraint bts_constraint =
+	EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
+
+static void intel_pmu_enable_bts(u64 config)
+{
+	unsigned long debugctlmsr;
+
+	debugctlmsr = get_debugctlmsr();
+
+	debugctlmsr |= X86_DEBUGCTL_TR;
+	debugctlmsr |= X86_DEBUGCTL_BTS;
+	debugctlmsr |= X86_DEBUGCTL_BTINT;
+
+	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+
+	if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+
+	update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_bts(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	unsigned long debugctlmsr;
+
+	if (!cpuc->ds)
+		return;
+
+	debugctlmsr = get_debugctlmsr();
+
+	debugctlmsr &=
+		~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
+		  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+
+	update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_drain_bts_buffer(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct debug_store *ds = cpuc->ds;
+	struct bts_record {
+		u64	from;
+		u64	to;
+		u64	flags;
+	};
+	struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
+	struct bts_record *at, *top;
+	struct perf_output_handle handle;
+	struct perf_event_header header;
+	struct perf_sample_data data;
+	struct pt_regs regs;
+
+	if (!event)
+		return;
+
+	if (!ds)
+		return;
+
+	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+	top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+	if (top <= at)
+		return;
+
+	ds->bts_index = ds->bts_buffer_base;
+
+	perf_sample_data_init(&data, 0);
+	data.period = event->hw.last_period;
+	regs.ip     = 0;
+
+	/*
+	 * Prepare a generic sample, i.e. fill in the invariant fields.
+	 * We will overwrite the from and to address before we output
+	 * the sample.
+	 */
+	perf_prepare_sample(&header, &data, event, &regs);
+
+	if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
+		return;
+
+	for (; at < top; at++) {
+		data.ip		= at->from;
+		data.addr	= at->to;
+
+		perf_output_sample(&handle, &header, &data, event);
+	}
+
+	perf_output_end(&handle);
+
+	/* There's new data available. */
+	event->hw.interrupts++;
+	event->pending_kill = POLL_IN;
+}
+
+/*
+ * PEBS
+ */
+
+static struct event_constraint intel_core_pebs_events[] = {
+	PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
+	PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+	PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+	PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
+	PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
+	PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+	PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
+	PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+	PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_pebs_events[] = {
+	PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
+	PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
+	PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
+	PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */
+	PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
+	PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+	PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
+	PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+	PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint *
+intel_pebs_constraints(struct perf_event *event)
+{
+	struct event_constraint *c;
+
+	if (!event->attr.precise)
+		return NULL;
+
+	if (x86_pmu.pebs_constraints) {
+		for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+			if ((event->hw.config & c->cmask) == c->code)
+				return c;
+		}
+	}
+
+	return &emptyconstraint;
+}
+
+static void intel_pmu_pebs_enable(struct hw_perf_event *hwc)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	u64 val = cpuc->pebs_enabled;
+
+	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+
+	val |= 1ULL << hwc->idx;
+	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+}
+
+static void intel_pmu_pebs_disable(struct hw_perf_event *hwc)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	u64 val = cpuc->pebs_enabled;
+
+	val &= ~(1ULL << hwc->idx);
+	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+
+	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+}
+
+static void intel_pmu_pebs_enable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->pebs_enabled)
+		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+}
+
+static void intel_pmu_pebs_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->pebs_enabled)
+		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+
+static int intel_pmu_save_and_restart(struct perf_event *event);
+static void intel_pmu_disable_event(struct perf_event *event);
+
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct debug_store *ds = cpuc->ds;
+	struct perf_event *event = cpuc->events[0]; /* PMC0 only */
+	struct pebs_record_core *at, *top;
+	struct perf_sample_data data;
+	struct pt_regs regs;
+	int n;
+
+	if (!event || !ds || !x86_pmu.pebs)
+		return;
+
+	intel_pmu_pebs_disable_all();
+
+	at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
+	top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
+
+	if (top <= at)
+		goto out;
+
+	ds->pebs_index = ds->pebs_buffer_base;
+
+	if (!intel_pmu_save_and_restart(event))
+		goto out;
+
+	perf_sample_data_init(&data, 0);
+	data.period = event->hw.last_period;
+
+	n = top - at;
+
+	/*
+	 * Should not happen, we program the threshold at 1 and do not
+	 * set a reset value.
+	 */
+	WARN_ON_ONCE(n > 1);
+
+	/*
+	 * We use the interrupt regs as a base because the PEBS record
+	 * does not contain a full regs set, specifically it seems to
+	 * lack segment descriptors, which get used by things like
+	 * user_mode().
+	 *
+	 * In the simple case fix up only the IP and BP,SP regs, for
+	 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
+	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
+	 */
+	regs = *iregs;
+	regs.ip = at->ip;
+	regs.bp = at->bp;
+	regs.sp = at->sp;
+
+	if (perf_event_overflow(event, 1, &data, &regs))
+		intel_pmu_disable_event(event);
+
+out:
+	intel_pmu_pebs_enable_all();
+}
+
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct debug_store *ds = cpuc->ds;
+	struct pebs_record_nhm *at, *top;
+	struct perf_sample_data data;
+	struct perf_event *event = NULL;
+	struct pt_regs regs;
+	int bit, n;
+
+	if (!ds || !x86_pmu.pebs)
+		return;
+
+	intel_pmu_pebs_disable_all();
+
+	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
+	if (top <= at)
+		goto out;
+
+	ds->pebs_index = ds->pebs_buffer_base;
+
+	n = top - at;
+
+	/*
+	 * Should not happen, we program the threshold at 1 and do not
+	 * set a reset value.
+	 */
+	WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
+
+	for ( ; at < top; at++) {
+		for_each_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
+			if (!cpuc->events[bit]->attr.precise)
+				continue;
+
+			event = cpuc->events[bit];
+		}
+
+		if (!event)
+			continue;
+
+		if (!intel_pmu_save_and_restart(event))
+			continue;
+
+		perf_sample_data_init(&data, 0);
+		data.period = event->hw.last_period;
+
+		/*
+		 * See the comment in intel_pmu_drain_pebs_core()
+		 */
+		regs = *iregs;
+		regs.ip = at->ip;
+		regs.bp = at->bp;
+		regs.sp = at->sp;
+
+		if (perf_event_overflow(event, 1, &data, &regs))
+			intel_pmu_disable_event(event);
+	}
+out:
+	intel_pmu_pebs_enable_all();
+}
+
+/*
+ * BTS, PEBS probe and setup
+ */
+
+static void intel_ds_init(void)
+{
+	/*
+	 * No support for 32bit formats
+	 */
+	if (!boot_cpu_has(X86_FEATURE_DTES64))
+		return;
+
+	x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
+	x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+	if (x86_pmu.pebs) {
+		int format = 0;
+
+		if (x86_pmu.version > 1) {
+			u64 capabilities;
+			/*
+			 * v2+ has a PEBS format field
+			 */
+			rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+			format = (capabilities >> 8) & 0xf;
+		}
+
+		switch (format) {
+		case 0:
+			printk(KERN_CONT "PEBS v0, ");
+			x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+			x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
+			x86_pmu.pebs_constraints = intel_core_pebs_events;
+			break;
+
+		case 1:
+			printk(KERN_CONT "PEBS v1, ");
+			x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
+			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+			x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
+			break;
+
+		default:
+			printk(KERN_CONT "PEBS unknown format: %d, ", format);
+			x86_pmu.pebs = 0;
+			break;
+		}
+	}
+}
+
+#else /* CONFIG_CPU_SUP_INTEL */
+
+static int reseve_ds_buffers(void)
+{
+	return 0;
+}
+
+static void release_ds_buffers(void)
+{
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 80acbf3d5de1..42307b50c787 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -203,8 +203,9 @@ struct perf_event_attr {
 				enable_on_exec :  1, /* next exec enables     */
 				task           :  1, /* trace fork/exit       */
 				watermark      :  1, /* wakeup_watermark      */
+				precise        :  1, /* OoO invariant counter */
 
-				__reserved_1   : 49;
+				__reserved_1   : 48;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
-- 
cgit v1.2.3-55-g7522


From caff2befffe899e63df5cc760b7ed01cfd902685 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 3 Mar 2010 12:02:30 +0100
Subject: perf, x86: Implement simple LBR support

Implement simple suport Intel Last-Branch-Record, it supports all
hardware that implements FREEZE_LBRS_ON_PMI, but does not (yet) implement
the LBR config register.

The Intel LBR is a FIFO of From,To addresses describing the last few
branches the hardware took.

This patch does not add perf interface to the LBR, but merely provides an
interface for internal use.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.544191154@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c           |  18 +++
 arch/x86/kernel/cpu/perf_event_intel.c     |  13 ++
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 228 +++++++++++++++++++++++++++++
 include/linux/perf_event.h                 |  11 ++
 4 files changed, 270 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_lbr.c

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0c03d5c1671f..1badff6b6b28 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -48,6 +48,8 @@ struct amd_nb {
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 };
 
+#define MAX_LBR_ENTRIES		16
+
 struct cpu_hw_events {
 	/*
 	 * Generic x86 PMC bits
@@ -69,6 +71,14 @@ struct cpu_hw_events {
 	struct debug_store	*ds;
 	u64			pebs_enabled;
 
+	/*
+	 * Intel LBR bits
+	 */
+	int				lbr_users;
+	void				*lbr_context;
+	struct perf_branch_stack	lbr_stack;
+	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
+
 	/*
 	 * AMD specific bits
 	 */
@@ -159,6 +169,13 @@ struct x86_pmu {
 	int		pebs_record_size;
 	void		(*drain_pebs)(struct pt_regs *regs);
 	struct event_constraint *pebs_constraints;
+
+	/*
+	 * Intel LBR
+	 */
+	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
+	int		lbr_nr;			   /* hardware stack size */
+	int		lbr_format;		   /* hardware format     */
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -1237,6 +1254,7 @@ undo:
 
 #include "perf_event_amd.c"
 #include "perf_event_p6.c"
+#include "perf_event_intel_lbr.c"
 #include "perf_event_intel_ds.c"
 #include "perf_event_intel.c"
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 11446412e4c7..44f6ed42a934 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -480,6 +480,7 @@ static void intel_pmu_disable_all(void)
 		intel_pmu_disable_bts();
 
 	intel_pmu_pebs_disable_all();
+	intel_pmu_lbr_disable_all();
 }
 
 static void intel_pmu_enable_all(void)
@@ -499,6 +500,7 @@ static void intel_pmu_enable_all(void)
 	}
 
 	intel_pmu_pebs_enable_all();
+	intel_pmu_lbr_enable_all();
 }
 
 static inline u64 intel_pmu_get_status(void)
@@ -674,6 +676,8 @@ again:
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
 
+	intel_pmu_lbr_read();
+
 	/*
 	 * PEBS overflow sets bit 62 in the global status register
 	 */
@@ -848,6 +852,8 @@ static __init int intel_pmu_init(void)
 		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
+		intel_pmu_lbr_init_core();
+
 		x86_pmu.event_constraints = intel_core2_event_constraints;
 		pr_cont("Core2 events, ");
 		break;
@@ -857,13 +863,18 @@ static __init int intel_pmu_init(void)
 		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
+		intel_pmu_lbr_init_nhm();
+
 		x86_pmu.event_constraints = intel_nehalem_event_constraints;
 		pr_cont("Nehalem/Corei7 events, ");
 		break;
+
 	case 28: /* Atom */
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
+		intel_pmu_lbr_init_atom();
+
 		x86_pmu.event_constraints = intel_gen_event_constraints;
 		pr_cont("Atom events, ");
 		break;
@@ -873,6 +884,8 @@ static __init int intel_pmu_init(void)
 		memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
+		intel_pmu_lbr_init_nhm();
+
 		x86_pmu.event_constraints = intel_westmere_event_constraints;
 		pr_cont("Westmere events, ");
 		break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
new file mode 100644
index 000000000000..ea3e99ed82ce
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -0,0 +1,228 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+enum {
+	LBR_FORMAT_32		= 0x00,
+	LBR_FORMAT_LIP		= 0x01,
+	LBR_FORMAT_EIP		= 0x02,
+	LBR_FORMAT_EIP_FLAGS	= 0x03,
+};
+
+/*
+ * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
+ * otherwise it becomes near impossible to get a reliable stack.
+ */
+
+#define X86_DEBUGCTL_LBR               		(1 << 0)
+#define X86_DEBUGCTL_FREEZE_LBRS_ON_PMI		(1 << 11)
+
+static void __intel_pmu_lbr_enable(void)
+{
+	u64 debugctl;
+
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	debugctl |= (X86_DEBUGCTL_LBR | X86_DEBUGCTL_FREEZE_LBRS_ON_PMI);
+	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void __intel_pmu_lbr_disable(void)
+{
+	u64 debugctl;
+
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	debugctl &= ~(X86_DEBUGCTL_LBR | X86_DEBUGCTL_FREEZE_LBRS_ON_PMI);
+	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void intel_pmu_lbr_reset_32(void)
+{
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++)
+		wrmsrl(x86_pmu.lbr_from + i, 0);
+}
+
+static void intel_pmu_lbr_reset_64(void)
+{
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		wrmsrl(x86_pmu.lbr_from + i, 0);
+		wrmsrl(x86_pmu.lbr_to   + i, 0);
+	}
+}
+
+static void intel_pmu_lbr_reset(void)
+{
+	if (x86_pmu.lbr_format == LBR_FORMAT_32)
+		intel_pmu_lbr_reset_32();
+	else
+		intel_pmu_lbr_reset_64();
+}
+
+static void intel_pmu_lbr_enable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (!x86_pmu.lbr_nr)
+		return;
+
+	WARN_ON(cpuc->enabled);
+
+	/*
+	 * Reset the LBR stack if this is the first LBR user or
+	 * we changed task context so as to avoid data leaks.
+	 */
+
+	if (!cpuc->lbr_users ||
+	    (event->ctx->task && cpuc->lbr_context != event->ctx)) {
+		intel_pmu_lbr_reset();
+		cpuc->lbr_context = event->ctx;
+	}
+
+	cpuc->lbr_users++;
+}
+
+static void intel_pmu_lbr_disable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (!x86_pmu.lbr_nr)
+		return;
+
+	cpuc->lbr_users--;
+
+	BUG_ON(cpuc->lbr_users < 0);
+	WARN_ON(cpuc->enabled);
+}
+
+static void intel_pmu_lbr_enable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->lbr_users)
+		__intel_pmu_lbr_enable();
+}
+
+static void intel_pmu_lbr_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->lbr_users)
+		__intel_pmu_lbr_disable();
+}
+
+static inline u64 intel_pmu_lbr_tos(void)
+{
+	u64 tos;
+
+	rdmsrl(x86_pmu.lbr_tos, tos);
+
+	return tos;
+}
+
+static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+{
+	unsigned long mask = x86_pmu.lbr_nr - 1;
+	u64 tos = intel_pmu_lbr_tos();
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++, tos--) {
+		unsigned long lbr_idx = (tos - i) & mask;
+		union {
+			struct {
+				u32 from;
+				u32 to;
+			};
+			u64     lbr;
+		} msr_lastbranch;
+
+		rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
+
+		cpuc->lbr_entries[i].from  = msr_lastbranch.from;
+		cpuc->lbr_entries[i].to    = msr_lastbranch.to;
+		cpuc->lbr_entries[i].flags = 0;
+	}
+	cpuc->lbr_stack.nr = i;
+}
+
+#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
+
+/*
+ * Due to lack of segmentation in Linux the effective address (offset)
+ * is the same as the linear address, allowing us to merge the LIP and EIP
+ * LBR formats.
+ */
+static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+{
+	unsigned long mask = x86_pmu.lbr_nr - 1;
+	u64 tos = intel_pmu_lbr_tos();
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++, tos--) {
+		unsigned long lbr_idx = (tos - i) & mask;
+		u64 from, to, flags = 0;
+
+		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
+		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
+
+		if (x86_pmu.lbr_format == LBR_FORMAT_EIP_FLAGS) {
+			flags = !!(from & LBR_FROM_FLAG_MISPRED);
+			from = (u64)((((s64)from) << 1) >> 1);
+		}
+
+		cpuc->lbr_entries[i].from  = from;
+		cpuc->lbr_entries[i].to    = to;
+		cpuc->lbr_entries[i].flags = flags;
+	}
+	cpuc->lbr_stack.nr = i;
+}
+
+static void intel_pmu_lbr_read(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (!cpuc->lbr_users)
+		return;
+
+	if (x86_pmu.lbr_format == LBR_FORMAT_32)
+		intel_pmu_lbr_read_32(cpuc);
+	else
+		intel_pmu_lbr_read_64(cpuc);
+}
+
+static int intel_pmu_lbr_format(void)
+{
+	u64 capabilities;
+
+	rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+	return capabilities & 0x1f;
+}
+
+static void intel_pmu_lbr_init_core(void)
+{
+	x86_pmu.lbr_format = intel_pmu_lbr_format();
+	x86_pmu.lbr_nr     = 4;
+	x86_pmu.lbr_tos    = 0x01c9;
+	x86_pmu.lbr_from   = 0x40;
+	x86_pmu.lbr_to     = 0x60;
+}
+
+static void intel_pmu_lbr_init_nhm(void)
+{
+	x86_pmu.lbr_format = intel_pmu_lbr_format();
+	x86_pmu.lbr_nr     = 16;
+	x86_pmu.lbr_tos    = 0x01c9;
+	x86_pmu.lbr_from   = 0x680;
+	x86_pmu.lbr_to     = 0x6c0;
+}
+
+static void intel_pmu_lbr_init_atom(void)
+{
+	x86_pmu.lbr_format = intel_pmu_lbr_format();
+	x86_pmu.lbr_nr	   = 8;
+	x86_pmu.lbr_tos    = 0x01c9;
+	x86_pmu.lbr_from   = 0x40;
+	x86_pmu.lbr_to     = 0x60;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 42307b50c787..ab4fd9ede264 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -467,6 +467,17 @@ struct perf_raw_record {
 	void				*data;
 };
 
+struct perf_branch_entry {
+	__u64				from;
+	__u64				to;
+	__u64				flags;
+};
+
+struct perf_branch_stack {
+	__u64				nr;
+	struct perf_branch_entry	entries[0];
+};
+
 struct task_struct;
 
 /**
-- 
cgit v1.2.3-55-g7522


From ef21f683a045a79b6aa86ad81e5fdfc0d5ddd250 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 3 Mar 2010 13:12:23 +0100
Subject: perf, x86: use LBR for PEBS IP+1 fixup

Use the LBR to fix up the PEBS IP+1 issue.

As said, PEBS reports the next instruction, here we use the LBR to find
the last branch and from that construct the actual IP. If the IP matches
the LBR-TO, we use LBR-FROM, otherwise we use the LBR-TO address as the
beginning of the last basic block and decode forward.

Once we find a match to the current IP, we use the previous location.

This patch introduces a new ABI element: PERF_RECORD_MISC_EXACT, which
conveys that the reported IP (PERF_SAMPLE_IP) is the exact instruction
that caused the event (barring CPU errata).

The fixup can fail due to various reasons:

 1) LBR contains invalid data (quite possible)
 2) part of the basic block got paged out
 3) the reported IP isn't part of the basic block (see 1)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.619375431@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h         | 19 +++++++
 arch/x86/kernel/cpu/perf_event.c          | 70 +++++++++++++-------------
 arch/x86/kernel/cpu/perf_event_intel.c    |  4 +-
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 84 ++++++++++++++++++++++++++++++-
 include/linux/perf_event.h                |  6 +++
 5 files changed, 144 insertions(+), 39 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index db6109a885a7..a9038c951619 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -136,6 +136,25 @@ extern void perf_events_lapic_init(void);
 
 #define PERF_EVENT_INDEX_OFFSET			0
 
+/*
+ * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
+ * This flag is otherwise unused and ABI specified to be 0, so nobody should
+ * care what we do with it.
+ */
+#define PERF_EFLAGS_EXACT	(1UL << 3)
+
+#define perf_misc_flags(regs)				\
+({	int misc = 0;					\
+	if (user_mode(regs))				\
+		misc |= PERF_RECORD_MISC_USER;		\
+	else						\
+		misc |= PERF_RECORD_MISC_KERNEL;	\
+	if (regs->flags & PERF_EFLAGS_EXACT)		\
+		misc |= PERF_RECORD_MISC_EXACT;		\
+	misc; })
+
+#define perf_instruction_pointer(regs)	((regs)->ip)
+
 #else
 static inline void init_hw_perf_events(void)		{ }
 static inline void perf_events_lapic_init(void)	{ }
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1badff6b6b28..5cb4e8dcee4b 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -29,6 +29,41 @@
 #include <asm/stacktrace.h>
 #include <asm/nmi.h>
 
+/*
+ * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ */
+static unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+	unsigned long offset, addr = (unsigned long)from;
+	int type = in_nmi() ? KM_NMI : KM_IRQ0;
+	unsigned long size, len = 0;
+	struct page *page;
+	void *map;
+	int ret;
+
+	do {
+		ret = __get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret)
+			break;
+
+		offset = addr & (PAGE_SIZE - 1);
+		size = min(PAGE_SIZE - offset, n - len);
+
+		map = kmap_atomic(page, type);
+		memcpy(to, map+offset, size);
+		kunmap_atomic(map, type);
+		put_page(page);
+
+		len  += size;
+		to   += size;
+		addr += size;
+
+	} while (len < n);
+
+	return len;
+}
+
 static u64 perf_event_mask __read_mostly;
 
 struct event_constraint {
@@ -1550,41 +1585,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 	dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
 }
 
-/*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
- */
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
-	unsigned long offset, addr = (unsigned long)from;
-	int type = in_nmi() ? KM_NMI : KM_IRQ0;
-	unsigned long size, len = 0;
-	struct page *page;
-	void *map;
-	int ret;
-
-	do {
-		ret = __get_user_pages_fast(addr, 1, 0, &page);
-		if (!ret)
-			break;
-
-		offset = addr & (PAGE_SIZE - 1);
-		size = min(PAGE_SIZE - offset, n - len);
-
-		map = kmap_atomic(page, type);
-		memcpy(to, map+offset, size);
-		kunmap_atomic(map, type);
-		put_page(page);
-
-		len  += size;
-		to   += size;
-		addr += size;
-
-	} while (len < n);
-
-	return len;
-}
-
 static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
 {
 	unsigned long bytes;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 44f6ed42a934..7eb78be3b229 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -547,7 +547,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
 	x86_pmu_disable_event(event);
 
 	if (unlikely(event->attr.precise))
-		intel_pmu_pebs_disable(hwc);
+		intel_pmu_pebs_disable(event);
 }
 
 static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
@@ -600,7 +600,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
 	}
 
 	if (unlikely(event->attr.precise))
-		intel_pmu_pebs_enable(hwc);
+		intel_pmu_pebs_enable(event);
 
 	__x86_pmu_enable_event(hwc);
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 0d994ef213b9..50e6ff3281fc 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -331,26 +331,32 @@ intel_pebs_constraints(struct perf_event *event)
 	return &emptyconstraint;
 }
 
-static void intel_pmu_pebs_enable(struct hw_perf_event *hwc)
+static void intel_pmu_pebs_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
 	u64 val = cpuc->pebs_enabled;
 
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
 	val |= 1ULL << hwc->idx;
 	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+
+	intel_pmu_lbr_enable(event);
 }
 
-static void intel_pmu_pebs_disable(struct hw_perf_event *hwc)
+static void intel_pmu_pebs_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
 	u64 val = cpuc->pebs_enabled;
 
 	val &= ~(1ULL << hwc->idx);
 	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+
+	intel_pmu_lbr_disable(event);
 }
 
 static void intel_pmu_pebs_enable_all(void)
@@ -369,6 +375,70 @@ static void intel_pmu_pebs_disable_all(void)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
 }
 
+#include <asm/insn.h>
+
+#define MAX_INSN_SIZE	16
+
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+	return ip > PAGE_OFFSET;
+#else
+	return (long)ip < 0;
+#endif
+}
+
+static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	unsigned long from = cpuc->lbr_entries[0].from;
+	unsigned long old_to, to = cpuc->lbr_entries[0].to;
+	unsigned long ip = regs->ip;
+
+	if (!cpuc->lbr_stack.nr || !from || !to)
+		return 0;
+
+	if (ip < to)
+		return 0;
+
+	/*
+	 * We sampled a branch insn, rewind using the LBR stack
+	 */
+	if (ip == to) {
+		regs->ip = from;
+		return 1;
+	}
+
+	do {
+		struct insn insn;
+		u8 buf[MAX_INSN_SIZE];
+		void *kaddr;
+
+		old_to = to;
+		if (!kernel_ip(ip)) {
+			int bytes, size = min_t(int, MAX_INSN_SIZE, ip - to);
+
+			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+			if (bytes != size)
+				return 0;
+
+			kaddr = buf;
+		} else
+			kaddr = (void *)to;
+
+		kernel_insn_init(&insn, kaddr);
+		insn_get_length(&insn);
+		to += insn.length;
+	} while (to < ip);
+
+	if (to == ip) {
+		regs->ip = old_to;
+		return 1;
+	}
+
+	return 0;
+}
+
 static int intel_pmu_save_and_restart(struct perf_event *event);
 static void intel_pmu_disable_event(struct perf_event *event);
 
@@ -424,6 +494,11 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	regs.bp = at->bp;
 	regs.sp = at->sp;
 
+	if (intel_pmu_pebs_fixup_ip(&regs))
+		regs.flags |= PERF_EFLAGS_EXACT;
+	else
+		regs.flags &= ~PERF_EFLAGS_EXACT;
+
 	if (perf_event_overflow(event, 1, &data, &regs))
 		intel_pmu_disable_event(event);
 
@@ -487,6 +562,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		regs.bp = at->bp;
 		regs.sp = at->sp;
 
+		if (intel_pmu_pebs_fixup_ip(&regs))
+			regs.flags |= PERF_EFLAGS_EXACT;
+		else
+			regs.flags &= ~PERF_EFLAGS_EXACT;
+
 		if (perf_event_overflow(event, 1, &data, &regs))
 			intel_pmu_disable_event(event);
 	}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ab4fd9ede264..be85f7c4a94f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -294,6 +294,12 @@ struct perf_event_mmap_page {
 #define PERF_RECORD_MISC_USER			(2 << 0)
 #define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
 
+#define PERF_RECORD_MISC_EXACT			(1 << 14)
+/*
+ * Reserve the last bit to indicate some extended misc field
+ */
+#define PERF_RECORD_MISC_EXT_RESERVED		(1 << 15)
+
 struct perf_event_header {
 	__u32	type;
 	__u16	misc;
-- 
cgit v1.2.3-55-g7522


From 8db909a7e3c888b5d45aef7650d74ccebe3ce725 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 3 Mar 2010 17:07:40 +0100
Subject: perf, x86: Clean up IA32_PERF_CAPABILITIES usage

Saner PERF_CAPABILITIES support, which also exposes pebs_trap. Use that
latter to make PEBS's use of LBR conditional since a fault-like pebs
should already report the correct IP.

( As of this writing there is no known hardware that implements
  !pebs_trap )

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.770650663@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c           | 15 +++++++++++++--
 arch/x86/kernel/cpu/perf_event_intel.c     | 10 ++++++++++
 arch/x86/kernel/cpu/perf_event_intel_ds.c  | 30 +++++++++++++++---------------
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 18 ++++--------------
 4 files changed, 42 insertions(+), 31 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5cb4e8dcee4b..7b5430b2efe7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -154,6 +154,17 @@ struct cpu_hw_events {
 #define for_each_event_constraint(e, c)	\
 	for ((e) = (c); (e)->cmask; (e)++)
 
+union perf_capabilities {
+	struct {
+		u64	lbr_format    : 6;
+		u64	pebs_trap     : 1;
+		u64	pebs_arch_reg : 1;
+		u64	pebs_format   : 4;
+		u64	smm_freeze    : 1;
+	};
+	u64	capabilities;
+};
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
@@ -195,7 +206,8 @@ struct x86_pmu {
 	/*
 	 * Intel Arch Perfmon v2+
 	 */
-	u64		intel_ctrl;
+	u64			intel_ctrl;
+	union perf_capabilities intel_cap;
 
 	/*
 	 * Intel DebugStore bits
@@ -210,7 +222,6 @@ struct x86_pmu {
 	 */
 	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
 	int		lbr_nr;			   /* hardware stack size */
-	int		lbr_format;		   /* hardware format     */
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 7eb78be3b229..246c07238823 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -835,6 +835,16 @@ static __init int intel_pmu_init(void)
 	if (version > 1)
 		x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
 
+	/*
+	 * v2 and above have a perf capabilities MSR
+	 */
+	if (version > 1) {
+		u64 capabilities;
+
+		rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+		x86_pmu.intel_cap.capabilities = capabilities;
+	}
+
 	intel_ds_init();
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 50e6ff3281fc..5e4029441b2d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -342,7 +342,8 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 	val |= 1ULL << hwc->idx;
 	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
 
-	intel_pmu_lbr_enable(event);
+	if (x86_pmu.intel_cap.pebs_trap)
+		intel_pmu_lbr_enable(event);
 }
 
 static void intel_pmu_pebs_disable(struct perf_event *event)
@@ -356,7 +357,8 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
 
-	intel_pmu_lbr_disable(event);
+	if (x86_pmu.intel_cap.pebs_trap)
+		intel_pmu_lbr_disable(event);
 }
 
 static void intel_pmu_pebs_enable_all(void)
@@ -395,6 +397,12 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	unsigned long old_to, to = cpuc->lbr_entries[0].to;
 	unsigned long ip = regs->ip;
 
+	/*
+	 * We don't need to fixup if the PEBS assist is fault like
+	 */
+	if (!x86_pmu.intel_cap.pebs_trap)
+		return 1;
+
 	if (!cpuc->lbr_stack.nr || !from || !to)
 		return 0;
 
@@ -589,34 +597,26 @@ static void intel_ds_init(void)
 	x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
 	x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
 	if (x86_pmu.pebs) {
-		int format = 0;
-
-		if (x86_pmu.version > 1) {
-			u64 capabilities;
-			/*
-			 * v2+ has a PEBS format field
-			 */
-			rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
-			format = (capabilities >> 8) & 0xf;
-		}
+		char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
+		int format = x86_pmu.intel_cap.pebs_format;
 
 		switch (format) {
 		case 0:
-			printk(KERN_CONT "PEBS v0, ");
+			printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
 			x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
 			x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
 			x86_pmu.pebs_constraints = intel_core_pebs_events;
 			break;
 
 		case 1:
-			printk(KERN_CONT "PEBS v1, ");
+			printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
 			x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
 			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
 			x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
 			break;
 
 		default:
-			printk(KERN_CONT "PEBS unknown format: %d, ", format);
+			printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
 			x86_pmu.pebs = 0;
 			break;
 		}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index ea3e99ed82ce..4f3a124329c4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -53,7 +53,7 @@ static void intel_pmu_lbr_reset_64(void)
 
 static void intel_pmu_lbr_reset(void)
 {
-	if (x86_pmu.lbr_format == LBR_FORMAT_32)
+	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
 		intel_pmu_lbr_reset_32();
 	else
 		intel_pmu_lbr_reset_64();
@@ -155,6 +155,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 {
 	unsigned long mask = x86_pmu.lbr_nr - 1;
+	int lbr_format = x86_pmu.intel_cap.lbr_format;
 	u64 tos = intel_pmu_lbr_tos();
 	int i;
 
@@ -165,7 +166,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
 		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
 
-		if (x86_pmu.lbr_format == LBR_FORMAT_EIP_FLAGS) {
+		if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
 			flags = !!(from & LBR_FROM_FLAG_MISPRED);
 			from = (u64)((((s64)from) << 1) >> 1);
 		}
@@ -184,23 +185,14 @@ static void intel_pmu_lbr_read(void)
 	if (!cpuc->lbr_users)
 		return;
 
-	if (x86_pmu.lbr_format == LBR_FORMAT_32)
+	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
 		intel_pmu_lbr_read_32(cpuc);
 	else
 		intel_pmu_lbr_read_64(cpuc);
 }
 
-static int intel_pmu_lbr_format(void)
-{
-	u64 capabilities;
-
-	rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
-	return capabilities & 0x1f;
-}
-
 static void intel_pmu_lbr_init_core(void)
 {
-	x86_pmu.lbr_format = intel_pmu_lbr_format();
 	x86_pmu.lbr_nr     = 4;
 	x86_pmu.lbr_tos    = 0x01c9;
 	x86_pmu.lbr_from   = 0x40;
@@ -209,7 +201,6 @@ static void intel_pmu_lbr_init_core(void)
 
 static void intel_pmu_lbr_init_nhm(void)
 {
-	x86_pmu.lbr_format = intel_pmu_lbr_format();
 	x86_pmu.lbr_nr     = 16;
 	x86_pmu.lbr_tos    = 0x01c9;
 	x86_pmu.lbr_from   = 0x680;
@@ -218,7 +209,6 @@ static void intel_pmu_lbr_init_nhm(void)
 
 static void intel_pmu_lbr_init_atom(void)
 {
-	x86_pmu.lbr_format = intel_pmu_lbr_format();
 	x86_pmu.lbr_nr	   = 8;
 	x86_pmu.lbr_tos    = 0x01c9;
 	x86_pmu.lbr_from   = 0x40;
-- 
cgit v1.2.3-55-g7522


From 7e1a40dda619b0483fbe0740494ed2c2a1f05289 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 4 Mar 2010 12:38:03 +0100
Subject: perf, x86: Expose the full PEBS record using PERF_SAMPLE_RAW

Expose the full PEBS record using PERF_SAMPLE_RAW

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.847218224@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 5e4029441b2d..ef56f053ab31 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -457,6 +457,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	struct perf_event *event = cpuc->events[0]; /* PMC0 only */
 	struct pebs_record_core *at, *top;
 	struct perf_sample_data data;
+	struct perf_raw_record raw;
 	struct pt_regs regs;
 	int n;
 
@@ -479,6 +480,12 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 
+	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+		raw.size = x86_pmu.pebs_record_size;
+		raw.data = at;
+		data.raw = &raw;
+	}
+
 	n = top - at;
 
 	/*
@@ -521,6 +528,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	struct pebs_record_nhm *at, *top;
 	struct perf_sample_data data;
 	struct perf_event *event = NULL;
+	struct perf_raw_record raw;
 	struct pt_regs regs;
 	int bit, n;
 
@@ -562,6 +570,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		perf_sample_data_init(&data, 0);
 		data.period = event->hw.last_period;
 
+		if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+			raw.size = x86_pmu.pebs_record_size;
+			raw.data = at;
+			data.raw = &raw;
+		}
+
 		/*
 		 * See the comment in intel_pmu_drain_pebs_core()
 		 */
-- 
cgit v1.2.3-55-g7522


From 30a813ae035d3e220a89609adce878e045c49547 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 4 Mar 2010 13:49:21 +0100
Subject: x86: Move MAX_INSN_SIZE into asm/insn.h

Since there's now two users for this, place it in a common header.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.923774125@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/insn.h               | 2 ++
 arch/x86/include/asm/kprobes.h            | 2 +-
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 2 --
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 96c2e0ad04ca..88c765e16410 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -68,6 +68,8 @@ struct insn {
 	const insn_byte_t *next_byte;
 };
 
+#define MAX_INSN_SIZE	16
+
 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
 #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
 #define X86_MODRM_RM(modrm) ((modrm) & 0x07)
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4ffa345a8ccb..547882539157 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -24,6 +24,7 @@
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
+#include <asm/insn.h>
 
 #define  __ARCH_WANT_KPROBES_INSN_SLOT
 
@@ -36,7 +37,6 @@ typedef u8 kprobe_opcode_t;
 #define RELATIVEJUMP_SIZE 5
 #define RELATIVECALL_OPCODE 0xe8
 #define RELATIVE_ADDR_SIZE 4
-#define MAX_INSN_SIZE 16
 #define MAX_STACK_SIZE 64
 #define MIN_STACK_SIZE(ADDR)					       \
 	(((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ef56f053ab31..72453ac5fb7d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -379,8 +379,6 @@ static void intel_pmu_pebs_disable_all(void)
 
 #include <asm/insn.h>
 
-#define MAX_INSN_SIZE	16
-
 static inline bool kernel_ip(unsigned long ip)
 {
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3-55-g7522


From 3adaebd69557615c1bf0365ce5e32d93ac7d82af Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 5 Mar 2010 12:09:29 +0100
Subject: perf, x86: Fix silly bug in data store buffer allocation

Fix up the ds allocation error path, where we could free @buffer before
we used it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100305154128.813452402@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 72453ac5fb7d..a67fff14475e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -127,10 +127,8 @@ static int reserve_ds_buffers(void)
 
 		err = -ENOMEM;
 		ds = kzalloc(sizeof(*ds), GFP_KERNEL);
-		if (unlikely(!ds)) {
-			kfree(buffer);
+		if (unlikely(!ds))
 			break;
-		}
 		per_cpu(cpu_hw_events, cpu).ds = ds;
 
 		if (x86_pmu.bts) {
-- 
cgit v1.2.3-55-g7522


From 3c44780b220e876b01e39d4028cd6f4205fbf5d6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 4 Mar 2010 21:49:01 +0100
Subject: perf, x86: Disable PEBS on clovertown chips

This CPU has just too many handycaps to be really useful.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100305154128.890278662@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       |  4 ++++
 arch/x86/kernel/cpu/perf_event_intel.c | 27 +++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 7b5430b2efe7..335ee1d38b79 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -197,6 +197,7 @@ struct x86_pmu {
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
 	struct event_constraint *event_constraints;
+	void		(*quirks)(void);
 
 	void		(*cpu_prepare)(int cpu);
 	void		(*cpu_starting)(int cpu);
@@ -1373,6 +1374,9 @@ void __init init_hw_perf_events(void)
 
 	pr_cont("%s PMU driver.\n", x86_pmu.name);
 
+	if (x86_pmu.quirks)
+		x86_pmu.quirks();
+
 	if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
 		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
 		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 246c07238823..224c952071f9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -792,6 +792,32 @@ static __initconst struct x86_pmu intel_pmu = {
 	.cpu_dying		= fini_debug_store_on_cpu,
 };
 
+static void intel_clovertown_quirks(void)
+{
+	/*
+	 * PEBS is unreliable due to:
+	 *
+	 *   AJ67  - PEBS may experience CPL leaks
+	 *   AJ68  - PEBS PMI may be delayed by one event
+	 *   AJ69  - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
+	 *   AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
+	 *
+	 * AJ67 could be worked around by restricting the OS/USR flags.
+	 * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
+	 *
+	 * AJ106 could possibly be worked around by not allowing LBR
+	 *       usage from PEBS, including the fixup.
+	 * AJ68  could possibly be worked around by always programming
+	 * 	 a pebs_event_reset[0] value and coping with the lost events.
+	 *
+	 * But taken together it might just make sense to not enable PEBS on
+	 * these chips.
+	 */
+	printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
+	x86_pmu.pebs = 0;
+	x86_pmu.pebs_constraints = NULL;
+}
+
 static __init int intel_pmu_init(void)
 {
 	union cpuid10_edx edx;
@@ -856,6 +882,7 @@ static __init int intel_pmu_init(void)
 		break;
 
 	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+		x86_pmu.quirks = intel_clovertown_quirks;
 	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
 	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
 	case 29: /* six-core 45 nm xeon "Dunnington" */
-- 
cgit v1.2.3-55-g7522


From 74846d35b24b6efd61bb88a0a750b6bb257e6e78 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 5 Mar 2010 13:49:35 +0100
Subject: perf, x86: Clear the LBRs on init

Some CPUs have errata where the LBR is not cleared on Power-On. So always
clear the LBRs before use.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100305154128.966563424@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel.c     | 18 ++++++++++++++++--
 arch/x86/kernel/cpu/perf_event_intel_lbr.c |  3 +++
 2 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 224c952071f9..c135ed735b22 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -767,6 +767,20 @@ static __initconst struct x86_pmu core_pmu = {
 	.event_constraints	= intel_core_event_constraints,
 };
 
+static void intel_pmu_cpu_starting(int cpu)
+{
+	init_debug_store_on_cpu(cpu);
+	/*
+	 * Deal with CPUs that don't clear their LBRs on power-up.
+	 */
+	intel_pmu_lbr_reset();
+}
+
+static void intel_pmu_cpu_dying(int cpu)
+{
+	fini_debug_store_on_cpu(cpu);
+}
+
 static __initconst struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
@@ -788,8 +802,8 @@ static __initconst struct x86_pmu intel_pmu = {
 	.max_period		= (1ULL << 31) - 1,
 	.get_event_constraints	= intel_get_event_constraints,
 
-	.cpu_starting		= init_debug_store_on_cpu,
-	.cpu_dying		= fini_debug_store_on_cpu,
+	.cpu_starting		= intel_pmu_cpu_starting,
+	.cpu_dying		= intel_pmu_cpu_dying,
 };
 
 static void intel_clovertown_quirks(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 4f3a124329c4..dcec765f8188 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -53,6 +53,9 @@ static void intel_pmu_lbr_reset_64(void)
 
 static void intel_pmu_lbr_reset(void)
 {
+	if (!x86_pmu.lbr_nr)
+		return;
+
 	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
 		intel_pmu_lbr_reset_32();
 	else
-- 
cgit v1.2.3-55-g7522


From a562b1871f7f7d2f3a835c3c1e07fa58d473cfb7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 5 Mar 2010 16:29:14 +0100
Subject: perf, x86: Robustify PEBS fixup

It turns out the LBR is massively unreliable on certain CPUs, so code the
fixup a little more defensive to avoid crashing the kernel.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100305154129.042271287@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index a67fff14475e..e7ac51770d4d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -399,10 +399,23 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	if (!x86_pmu.intel_cap.pebs_trap)
 		return 1;
 
+	/*
+	 * No LBR entry, no basic block, no rewinding
+	 */
 	if (!cpuc->lbr_stack.nr || !from || !to)
 		return 0;
 
-	if (ip < to)
+	/*
+	 * Basic blocks should never cross user/kernel boundaries
+	 */
+	if (kernel_ip(ip) != kernel_ip(to))
+		return 0;
+
+	/*
+	 * unsigned math, either ip is before the start (impossible) or
+	 * the basic block is larger than 1 page (sanity)
+	 */
+	if ((ip - to) > PAGE_SIZE)
 		return 0;
 
 	/*
@@ -420,7 +433,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 
 		old_to = to;
 		if (!kernel_ip(ip)) {
-			int bytes, size = min_t(int, MAX_INSN_SIZE, ip - to);
+			int bytes, size = MAX_INSN_SIZE;
 
 			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
 			if (bytes != size)
@@ -440,6 +453,10 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 		return 1;
 	}
 
+	/*
+	 * Even though we decoded the basic block, the instruction stream
+	 * never matched the given IP, either the TO or the IP got corrupted.
+	 */
 	return 0;
 }
 
-- 
cgit v1.2.3-55-g7522


From cc7f00820b2f3be656569c41158d9323e425bcfe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 8 Mar 2010 17:51:33 +0100
Subject: perf, x86: Avoid double disable on throttle vs
 ioctl(PERF_IOC_DISABLE)

Calling ioctl(PERF_EVENT_IOC_DISABLE) on a thottled counter would result
in a double disable, cure this by using x86_pmu_{start,stop} for
throttle/unthrottle and teach x86_pmu_stop() to check ->active_mask.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index e7ac51770d4d..a7401e4167df 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -461,7 +461,6 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 }
 
 static int intel_pmu_save_and_restart(struct perf_event *event);
-static void intel_pmu_disable_event(struct perf_event *event);
 
 static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 {
@@ -528,7 +527,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 		regs.flags &= ~PERF_EFLAGS_EXACT;
 
 	if (perf_event_overflow(event, 1, &data, &regs))
-		intel_pmu_disable_event(event);
+		x86_pmu_stop(event);
 
 out:
 	intel_pmu_pebs_enable_all();
@@ -603,7 +602,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 			regs.flags &= ~PERF_EFLAGS_EXACT;
 
 		if (perf_event_overflow(event, 1, &data, &regs))
-			intel_pmu_disable_event(event);
+			x86_pmu_stop(event);
 	}
 out:
 	intel_pmu_pebs_enable_all();
-- 
cgit v1.2.3-55-g7522


From 8f4aebd2be9892bf8fb79a2d8576d3f3ee7f00f6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Sat, 6 Mar 2010 13:26:11 +0100
Subject: perf, x86: Fix pebs drains

I overlooked the perf_disable()/perf_enable() calls in
intel_pmu_handle_irq(), (pointed out by Markus) so we should not
explicitly disable_all/enable_all pebs counters in the drain functions,
these are already disabled and enabling them early is confusing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index a7401e4167df..66c6962f15f9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -476,18 +476,16 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	if (!event || !ds || !x86_pmu.pebs)
 		return;
 
-	intel_pmu_pebs_disable_all();
-
 	at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
 
 	if (top <= at)
-		goto out;
+		return;
 
 	ds->pebs_index = ds->pebs_buffer_base;
 
 	if (!intel_pmu_save_and_restart(event))
-		goto out;
+		return;
 
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
@@ -528,9 +526,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 
 	if (perf_event_overflow(event, 1, &data, &regs))
 		x86_pmu_stop(event);
-
-out:
-	intel_pmu_pebs_enable_all();
 }
 
 static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
@@ -547,13 +542,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	if (!ds || !x86_pmu.pebs)
 		return;
 
-	intel_pmu_pebs_disable_all();
-
 	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
 
 	if (top <= at)
-		goto out;
+		return;
 
 	ds->pebs_index = ds->pebs_buffer_base;
 
@@ -604,8 +597,6 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		if (perf_event_overflow(event, 1, &data, &regs))
 			x86_pmu_stop(event);
 	}
-out:
-	intel_pmu_pebs_enable_all();
 }
 
 /*
-- 
cgit v1.2.3-55-g7522


From 4807e3d5dc7bb7057dd6ca3abb09f3da2eb8c323 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Sat, 6 Mar 2010 13:47:07 +0100
Subject: perf, x86: Fix PEBS enable/disable vs cpuc->enabled

We should never call ->enable with the pmu enabled, and we _can_ have
->disable called with the pmu enabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 66c6962f15f9..9ad0e67b9c82 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -338,7 +338,7 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
 	val |= 1ULL << hwc->idx;
-	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+	WARN_ON_ONCE(cpuc->enabled);
 
 	if (x86_pmu.intel_cap.pebs_trap)
 		intel_pmu_lbr_enable(event);
@@ -351,7 +351,8 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 	u64 val = cpuc->pebs_enabled;
 
 	val &= ~(1ULL << hwc->idx);
-	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+	if (cpuc->enabled)
+		wrmsrl(MSR_IA32_PEBS_ENABLE, val);
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
 
-- 
cgit v1.2.3-55-g7522


From 2df202bf7520eaffcbfb07e45dfa3cfb0aeee2c0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Sat, 6 Mar 2010 13:48:54 +0100
Subject: perf, x86: Fix LBR enable/disable vs cpuc->enabled

We should never call ->enable with the pmu enabled, and we _can_ have
->disable called with the pmu enabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index dcec765f8188..0145f99f7a35 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -69,7 +69,7 @@ static void intel_pmu_lbr_enable(struct perf_event *event)
 	if (!x86_pmu.lbr_nr)
 		return;
 
-	WARN_ON(cpuc->enabled);
+	WARN_ON_ONCE(cpuc->enabled);
 
 	/*
 	 * Reset the LBR stack if this is the first LBR user or
@@ -93,9 +93,10 @@ static void intel_pmu_lbr_disable(struct perf_event *event)
 		return;
 
 	cpuc->lbr_users--;
-
 	BUG_ON(cpuc->lbr_users < 0);
-	WARN_ON(cpuc->enabled);
+
+	if (cpuc->enabled && !cpuc->lbr_users)
+		__intel_pmu_lbr_disable();
 }
 
 static void intel_pmu_lbr_enable_all(void)
-- 
cgit v1.2.3-55-g7522


From d329527e47851f84b1e7944ed9601205f35f1b93 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 8 Mar 2010 13:57:14 +0100
Subject: perf, x86: Reorder intel_pmu_enable_all()

The documentation says we have to enable PEBS before we enable the PMU
proper.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index c135ed735b22..d3e2424069a7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -487,6 +487,8 @@ static void intel_pmu_enable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
+	intel_pmu_pebs_enable_all();
+	intel_pmu_lbr_enable_all();
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 
 	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
@@ -498,9 +500,6 @@ static void intel_pmu_enable_all(void)
 
 		intel_pmu_enable_bts(event->hw.config);
 	}
-
-	intel_pmu_pebs_enable_all();
-	intel_pmu_lbr_enable_all();
 }
 
 static inline u64 intel_pmu_get_status(void)
-- 
cgit v1.2.3-55-g7522


From 12ab854d744f04bfc5c6c4db723b7e31fc03eb29 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Sat, 6 Mar 2010 18:57:38 +0100
Subject: perf, x86: Deal with multiple state bits for pebs-fmt1

Its unclear if the PEBS state record will have only a single bit set, in
case it does not and accumulates bits, deal with that by only processing
each event once.

Also, robustify some of the code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 9ad0e67b9c82..b4680daecf19 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -538,6 +538,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	struct perf_event *event = NULL;
 	struct perf_raw_record raw;
 	struct pt_regs regs;
+	u64 status = 0;
 	int bit, n;
 
 	if (!ds || !x86_pmu.pebs)
@@ -561,13 +562,22 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 
 	for ( ; at < top; at++) {
 		for_each_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
-			if (!cpuc->events[bit]->attr.precise)
+			event = cpuc->events[bit];
+			if (!test_bit(bit, cpuc->active_mask))
 				continue;
 
-			event = cpuc->events[bit];
+			WARN_ON_ONCE(!event);
+
+			if (!event->attr.precise)
+				continue;
+
+			if (__test_and_set_bit(bit, (unsigned long *)&status))
+				continue;
+
+			break;
 		}
 
-		if (!event)
+		if (!event || bit >= MAX_PEBS_EVENTS)
 			continue;
 
 		if (!intel_pmu_save_and_restart(event))
-- 
cgit v1.2.3-55-g7522


From ad0e6cfe2a2a61d7b5530188e571d508146cb43b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Sat, 6 Mar 2010 19:49:06 +0100
Subject: perf, x86: Fix silly bug in intel_pmu_pebs_{enable,disable}

We need to use the actual cpuc->pebs_enabled value, not a local copy for
the changes to take effect.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index b4680daecf19..242369488e72 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -333,11 +333,10 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	u64 val = cpuc->pebs_enabled;
 
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
-	val |= 1ULL << hwc->idx;
+	cpuc->pebs_enabled |= 1ULL << hwc->idx;
 	WARN_ON_ONCE(cpuc->enabled);
 
 	if (x86_pmu.intel_cap.pebs_trap)
@@ -348,11 +347,10 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	u64 val = cpuc->pebs_enabled;
 
-	val &= ~(1ULL << hwc->idx);
+	cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
 	if (cpuc->enabled)
-		wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
 
-- 
cgit v1.2.3-55-g7522


From b83a46e7da4a948cc852ba7805dfb1a392dec861 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 8 Mar 2010 13:51:12 +0100
Subject: perf, x86: Don't reset the LBR as frequently

If we reset the LBR on each first counter, simple counter rotation which
first deschedules all counters and then reschedules the new ones will
lead to LBR reset, even though we're still in the same task context.

Reduce this by not flushing on the first counter but only flushing on
different task contexts.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 0145f99f7a35..f278136bf918 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -72,12 +72,11 @@ static void intel_pmu_lbr_enable(struct perf_event *event)
 	WARN_ON_ONCE(cpuc->enabled);
 
 	/*
-	 * Reset the LBR stack if this is the first LBR user or
-	 * we changed task context so as to avoid data leaks.
+	 * Reset the LBR stack if we changed task context to
+	 * avoid data leaks.
 	 */
 
-	if (!cpuc->lbr_users ||
-	    (event->ctx->task && cpuc->lbr_context != event->ctx)) {
+	if (event->ctx->task && cpuc->lbr_context != event->ctx) {
 		intel_pmu_lbr_reset();
 		cpuc->lbr_context = event->ctx;
 	}
@@ -93,7 +92,7 @@ static void intel_pmu_lbr_disable(struct perf_event *event)
 		return;
 
 	cpuc->lbr_users--;
-	BUG_ON(cpuc->lbr_users < 0);
+	WARN_ON_ONCE(cpuc->lbr_users < 0);
 
 	if (cpuc->enabled && !cpuc->lbr_users)
 		__intel_pmu_lbr_disable();
-- 
cgit v1.2.3-55-g7522


From 7645a24cbd01cbf4865d1273d5ddaa8d8c2ccb3a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 8 Mar 2010 13:51:31 +0100
Subject: perf, x86: Remove checking_{wr,rd}msr() usage

We don't need checking_{wr,rd}msr() calls, since we should know what cpu
we're running on and not use blindly poke at msrs.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       | 24 ++++++++++++++++++------
 arch/x86/kernel/cpu/perf_event_intel.c |  5 ++---
 2 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 335ee1d38b79..e24f6374f9f5 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -29,6 +29,17 @@
 #include <asm/stacktrace.h>
 #include <asm/nmi.h>
 
+#if 0
+#undef wrmsrl
+#define wrmsrl(msr, val) 					\
+do {								\
+	trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
+			(unsigned long)(val));			\
+	native_write_msr((msr), (u32)((u64)(val)), 		\
+			(u32)((u64)(val) >> 32));		\
+} while (0)
+#endif
+
 /*
  * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
  */
@@ -821,14 +832,15 @@ void hw_perf_enable(void)
 
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
 {
-	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+	wrmsrl(hwc->config_base + hwc->idx,
 			      hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 
 static inline void x86_pmu_disable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	(void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
+
+	wrmsrl(hwc->config_base + hwc->idx, hwc->config);
 }
 
 static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -843,7 +855,7 @@ x86_perf_event_set_period(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	s64 left = atomic64_read(&hwc->period_left);
 	s64 period = hwc->sample_period;
-	int err, ret = 0, idx = hwc->idx;
+	int ret = 0, idx = hwc->idx;
 
 	if (idx == X86_PMC_IDX_FIXED_BTS)
 		return 0;
@@ -881,8 +893,8 @@ x86_perf_event_set_period(struct perf_event *event)
 	 */
 	atomic64_set(&hwc->prev_count, (u64)-left);
 
-	err = checking_wrmsrl(hwc->event_base + idx,
-			     (u64)(-left) & x86_pmu.event_mask);
+	wrmsrl(hwc->event_base + idx,
+			(u64)(-left) & x86_pmu.event_mask);
 
 	perf_event_update_userpage(event);
 
@@ -987,7 +999,7 @@ void perf_event_print_debug(void)
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
 		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
 	}
-	pr_info("CPU#%d: active:       %016llx\n", cpu, *(u64 *)cpuc->active_mask);
+	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
 
 	for (idx = 0; idx < x86_pmu.num_events; idx++) {
 		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index d3e2424069a7..971dc6e7d54b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -525,7 +525,7 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 
 	rdmsrl(hwc->config_base, ctrl_val);
 	ctrl_val &= ~mask;
-	(void)checking_wrmsrl(hwc->config_base, ctrl_val);
+	wrmsrl(hwc->config_base, ctrl_val);
 }
 
 static void intel_pmu_disable_event(struct perf_event *event)
@@ -553,7 +553,6 @@ static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
 {
 	int idx = hwc->idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
-	int err;
 
 	/*
 	 * Enable IRQ generation (0x8),
@@ -578,7 +577,7 @@ static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
 	rdmsrl(hwc->config_base, ctrl_val);
 	ctrl_val &= ~mask;
 	ctrl_val |= bits;
-	err = checking_wrmsrl(hwc->config_base, ctrl_val);
+	wrmsrl(hwc->config_base, ctrl_val);
 }
 
 static void intel_pmu_enable_event(struct perf_event *event)
-- 
cgit v1.2.3-55-g7522


From d80c7502ff63aa0d99d8c0c5803d28bbef67a74e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 9 Mar 2010 11:41:02 +0100
Subject: perf, x86: Fixup the PEBS handler for Core2 cpus

Pull the core handler in line with the nhm one, also make sure we always
drain the buffer.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 38 +++++++++++++++++++------------
 1 file changed, 24 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 242369488e72..1bfd59beb658 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -472,20 +472,39 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	struct pt_regs regs;
 	int n;
 
-	if (!event || !ds || !x86_pmu.pebs)
+	if (!ds || !x86_pmu.pebs)
 		return;
 
 	at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
 
-	if (top <= at)
+	/*
+	 * Whatever else happens, drain the thing
+	 */
+	ds->pebs_index = ds->pebs_buffer_base;
+
+	if (!test_bit(0, cpuc->active_mask))
 		return;
 
-	ds->pebs_index = ds->pebs_buffer_base;
+	WARN_ON_ONCE(!event);
+
+	if (!event->attr.precise)
+		return;
+
+	n = top - at;
+	if (n <= 0)
+		return;
 
 	if (!intel_pmu_save_and_restart(event))
 		return;
 
+	/*
+	 * Should not happen, we program the threshold at 1 and do not
+	 * set a reset value.
+	 */
+	WARN_ON_ONCE(n > 1);
+	at += n - 1;
+
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 
@@ -495,14 +514,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 		data.raw = &raw;
 	}
 
-	n = top - at;
-
-	/*
-	 * Should not happen, we program the threshold at 1 and do not
-	 * set a reset value.
-	 */
-	WARN_ON_ONCE(n > 1);
-
 	/*
 	 * We use the interrupt regs as a base because the PEBS record
 	 * does not contain a full regs set, specifically it seems to
@@ -545,12 +556,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
 
-	if (top <= at)
-		return;
-
 	ds->pebs_index = ds->pebs_buffer_base;
 
 	n = top - at;
+	if (n <= 0)
+		return;
 
 	/*
 	 * Should not happen, we program the threshold at 1 and do not
-- 
cgit v1.2.3-55-g7522


From 63fb3f9b2312e131be5a0a2dddb63f2fb123db9b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 9 Mar 2010 11:51:02 +0100
Subject: perf, x86: Fix LBR read-out

Don't decrement the TOS twice...

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index f278136bf918..df4c98e26c5b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -129,7 +129,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 	u64 tos = intel_pmu_lbr_tos();
 	int i;
 
-	for (i = 0; i < x86_pmu.lbr_nr; i++, tos--) {
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		unsigned long lbr_idx = (tos - i) & mask;
 		union {
 			struct {
@@ -162,7 +162,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 	u64 tos = intel_pmu_lbr_tos();
 	int i;
 
-	for (i = 0; i < x86_pmu.lbr_nr; i++, tos--) {
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		unsigned long lbr_idx = (tos - i) & mask;
 		u64 from, to, flags = 0;
 
-- 
cgit v1.2.3-55-g7522


From ba7e4d13fc7e25af1d167d40e6f028298dfc55ad Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Sat, 6 Jun 2009 13:58:12 +0200
Subject: perf, x86: Add INSTRUCTION_DECODER config flag

The PEBS+LBR decoding magic needs the insn_get_length() infrastructure
to be able to decode x86 instruction length.

So split it out of KPROBES dependency and make it enabled when either
KPROBES or PERF_EVENTS is enabled.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig      | 3 +++
 arch/x86/lib/Makefile | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e98440371525..e1240f652a9b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -58,6 +58,9 @@ config X86
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
 
+config INSTRUCTION_DECODER
+	def_bool (KPROBES || PERF_EVENTS)
+
 config OUTPUT_FORMAT
 	string
 	default "elf32-i386" if X86_32
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 419386c24b82..cbaf8f2b83df 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -20,7 +20,7 @@ lib-y := delay.o
 lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
-lib-$(CONFIG_KPROBES) += insn.o inat.o
+lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o
 
-- 
cgit v1.2.3-55-g7522


From caa0142d84ceb0fc83e28f0475d0a7316cb6df77 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Sat, 6 Jun 2009 13:58:12 +0200
Subject: perf, x86: Fix the !CONFIG_CPU_SUP_INTEL build

Fix typo. But the modularization here is ugly and should be improved.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 1bfd59beb658..c59678a14a2e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -661,7 +661,7 @@ static void intel_ds_init(void)
 
 #else /* CONFIG_CPU_SUP_INTEL */
 
-static int reseve_ds_buffers(void)
+static int reserve_ds_buffers(void)
 {
 	return 0;
 }
-- 
cgit v1.2.3-55-g7522


From 938179b4f8cf8a4f11234ebf2dff2eb48400acfe Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich
Date: Fri, 5 Mar 2010 11:42:03 -0600
Subject: x86: Improve Intel microcode loader performance

We've noticed that on large SGI UV system configurations,
running microcode.ctl can take very long periods of time.  This
is due to the large number of vmalloc/vfree calls made by the
Intel generic_load_microcode() logic.

By reusing allocated space, the following patch reduces the time
to run microcode.ctl on a 1024 cpu system from approximately 80
seconds down to 1 or 2 seconds.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Acked-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Bill Davidsen <davidsen@tmr.com>
LKML-Reference: <20100305174203.GA19638@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_intel.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 85a343e28937..356170262a93 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -343,10 +343,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 				int (*get_ucode_data)(void *, const void *, size_t))
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+	u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
 	int new_rev = uci->cpu_sig.rev;
 	unsigned int leftover = size;
 	enum ucode_state state = UCODE_OK;
+	unsigned int curr_mc_size = 0;
 
 	while (leftover) {
 		struct microcode_header_intel mc_header;
@@ -361,9 +362,15 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 			break;
 		}
 
-		mc = vmalloc(mc_size);
-		if (!mc)
-			break;
+		/* For performance reasons, reuse mc area when possible */
+		if (!mc || mc_size > curr_mc_size) {
+			if (mc)
+				vfree(mc);
+			mc = vmalloc(mc_size);
+			if (!mc)
+				break;
+			curr_mc_size = mc_size;
+		}
 
 		if (get_ucode_data(mc, ucode_ptr, mc_size) ||
 		    microcode_sanity_check(mc) < 0) {
@@ -376,13 +383,16 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 				vfree(new_mc);
 			new_rev = mc_header.rev;
 			new_mc  = mc;
-		} else
-			vfree(mc);
+			mc = NULL;	/* trigger new vmalloc */
+		}
 
 		ucode_ptr += mc_size;
 		leftover  -= mc_size;
 	}
 
+	if (mc)
+		vfree(mc);
+
 	if (leftover) {
 		if (new_mc)
 			vfree(new_mc);
-- 
cgit v1.2.3-55-g7522


From 6f4edd69e40aba4f45bf9558c1e9a950d79ab4e4 Mon Sep 17 00:00:00 2001
From: Jack Steiner
Date: Wed, 10 Mar 2010 14:44:58 -0600
Subject: x86, UV: Clean up UV headers for MMR definitions

Update UV mmr definitions header file. Eliminate definitions no
longer needed. Move 2 definitions from tlb_uv.c into the header
file where they belong.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20100310204458.GA28835@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_mmrs.h | 528 ++++++--------------------------------
 arch/x86/kernel/tlb_uv.c          |  10 +-
 2 files changed, 85 insertions(+), 453 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 2cae46c7c8a2..b2f2d2e05cec 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -1,4 +1,3 @@
-
 /*
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
@@ -14,14 +13,26 @@
 
 #define UV_MMR_ENABLE		(1UL << 63)
 
+/* ========================================================================= */
+/*                          UVH_BAU_DATA_BROADCAST                           */
+/* ========================================================================= */
+#define UVH_BAU_DATA_BROADCAST 0x61688UL
+#define UVH_BAU_DATA_BROADCAST_32 0x0440
+
+#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
+#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
+
+union uvh_bau_data_broadcast_u {
+    unsigned long	v;
+    struct uvh_bau_data_broadcast_s {
+	unsigned long	enable :  1;  /* RW */
+	unsigned long	rsvd_1_63: 63;  /*    */
+    } s;
+};
+
 /* ========================================================================= */
 /*                           UVH_BAU_DATA_CONFIG                             */
 /* ========================================================================= */
-#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
-#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
-/* 1011 timebase 7 (168millisec) * 3 ticks -> 500ms */
 #define UVH_BAU_DATA_CONFIG 0x61680UL
 #define UVH_BAU_DATA_CONFIG_32 0x0438
 
@@ -603,6 +614,68 @@ union uvh_lb_bau_intd_software_acknowledge_u {
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70
 
+/* ========================================================================= */
+/*                         UVH_LB_BAU_MISC_CONTROL                           */
+/* ========================================================================= */
+#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
+#define UVH_LB_BAU_MISC_CONTROL_32 0x00a10
+
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+union uvh_lb_bau_misc_control_u {
+    unsigned long	v;
+    struct uvh_lb_bau_misc_control_s {
+	unsigned long	rejection_delay                    :  8;  /* RW */
+	unsigned long	apic_mode                          :  1;  /* RW */
+	unsigned long	force_broadcast                    :  1;  /* RW */
+	unsigned long	force_lock_nop                     :  1;  /* RW */
+	unsigned long	csi_agent_presence_vector          :  3;  /* RW */
+	unsigned long	descriptor_fetch_mode              :  1;  /* RW */
+	unsigned long	enable_intd_soft_ack_mode          :  1;  /* RW */
+	unsigned long	intd_soft_ack_timeout_period       :  4;  /* RW */
+	unsigned long	enable_dual_mapping_mode           :  1;  /* RW */
+	unsigned long	vga_io_port_decode_enable          :  1;  /* RW */
+	unsigned long	vga_io_port_16_bit_decode          :  1;  /* RW */
+	unsigned long	suppress_dest_registration         :  1;  /* RW */
+	unsigned long	programmed_initial_priority        :  3;  /* RW */
+	unsigned long	use_incoming_priority              :  1;  /* RW */
+	unsigned long	enable_programmed_initial_priority :  1;  /* RW */
+	unsigned long	rsvd_29_47                         : 19;  /*    */
+	unsigned long	fun                                : 16;  /* RW */
+    } s;
+};
+
 /* ========================================================================= */
 /*                     UVH_LB_BAU_SB_ACTIVATION_CONTROL                      */
 /* ========================================================================= */
@@ -680,334 +753,6 @@ union uvh_lb_bau_sb_descriptor_base_u {
     } s;
 };
 
-/* ========================================================================= */
-/*                      UVH_LB_MCAST_AOERR0_RPT_ENABLE                       */
-/* ========================================================================= */
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL
-
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 0x0000000000000080UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_SHFT 22
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_MASK 0x0000000000400000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_SHFT 23
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_MASK 0x0000000000800000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 24
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000001000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 25
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000002000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 26
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000004000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 27
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000008000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 28
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000010000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 29
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000020000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 30
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000040000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 31
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000080000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 32
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000100000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 33
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000200000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 34
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000400000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 35
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000800000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 36
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000001000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 37
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000002000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 38
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000004000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 39
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000008000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 40
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000010000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 41
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000020000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 42
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000040000000000UL
-
-union uvh_lb_mcast_aoerr0_rpt_enable_u {
-    unsigned long	v;
-    struct uvh_lb_mcast_aoerr0_rpt_enable_s {
-	unsigned long	mcast_obese_msg                         :  1;  /* RW */
-	unsigned long	mcast_data_sb_err                       :  1;  /* RW */
-	unsigned long	mcast_nack_buff_parity                  :  1;  /* RW */
-	unsigned long	mcast_timeout                           :  1;  /* RW */
-	unsigned long	mcast_inactive_reply                    :  1;  /* RW */
-	unsigned long	mcast_upgrade_error                     :  1;  /* RW */
-	unsigned long	mcast_reg_count_underflow               :  1;  /* RW */
-	unsigned long	mcast_rep_obese_msg                     :  1;  /* RW */
-	unsigned long	ucache_req_runt_msg                     :  1;  /* RW */
-	unsigned long	ucache_req_obese_msg                    :  1;  /* RW */
-	unsigned long	ucache_req_data_sb_err                  :  1;  /* RW */
-	unsigned long	ucache_rep_runt_msg                     :  1;  /* RW */
-	unsigned long	ucache_rep_obese_msg                    :  1;  /* RW */
-	unsigned long	ucache_rep_data_sb_err                  :  1;  /* RW */
-	unsigned long	ucache_rep_command_err                  :  1;  /* RW */
-	unsigned long	ucache_pend_timeout                     :  1;  /* RW */
-	unsigned long	macc_req_runt_msg                       :  1;  /* RW */
-	unsigned long	macc_req_obese_msg                      :  1;  /* RW */
-	unsigned long	macc_req_data_sb_err                    :  1;  /* RW */
-	unsigned long	macc_rep_runt_msg                       :  1;  /* RW */
-	unsigned long	macc_rep_obese_msg                      :  1;  /* RW */
-	unsigned long	macc_rep_data_sb_err                    :  1;  /* RW */
-	unsigned long	macc_amo_timeout                        :  1;  /* RW */
-	unsigned long	macc_put_timeout                        :  1;  /* RW */
-	unsigned long	macc_spurious_event                     :  1;  /* RW */
-	unsigned long	ioh_destination_table_parity            :  1;  /* RW */
-	unsigned long	get_had_error_reply                     :  1;  /* RW */
-	unsigned long	get_timeout                             :  1;  /* RW */
-	unsigned long	lock_manager_had_error_reply            :  1;  /* RW */
-	unsigned long	put_had_error_reply                     :  1;  /* RW */
-	unsigned long	put_timeout                             :  1;  /* RW */
-	unsigned long	sb_activation_overrun                   :  1;  /* RW */
-	unsigned long	completed_gb_activation_had_error_reply :  1;  /* RW */
-	unsigned long	completed_gb_activation_timeout         :  1;  /* RW */
-	unsigned long	descriptor_buffer_0_parity              :  1;  /* RW */
-	unsigned long	descriptor_buffer_1_parity              :  1;  /* RW */
-	unsigned long	socket_destination_table_parity         :  1;  /* RW */
-	unsigned long	bau_reply_payload_corruption            :  1;  /* RW */
-	unsigned long	io_port_destination_table_parity        :  1;  /* RW */
-	unsigned long	intd_soft_ack_timeout                   :  1;  /* RW */
-	unsigned long	int_rep_obese_msg                       :  1;  /* RW */
-	unsigned long	int_rep_command_err                     :  1;  /* RW */
-	unsigned long	int_timeout                             :  1;  /* RW */
-	unsigned long	rsvd_43_63                              : 21;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                          UVH_LOCAL_INT0_CONFIG                            */
-/* ========================================================================= */
-#define UVH_LOCAL_INT0_CONFIG 0x61000UL
-
-#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8
-#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13
-#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15
-#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16
-#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_local_int0_config_u {
-    unsigned long	v;
-    struct uvh_local_int0_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                          UVH_LOCAL_INT0_ENABLE                            */
-/* ========================================================================= */
-#define UVH_LOCAL_INT0_ENABLE 0x65000UL
-
-#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0
-#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL
-#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1
-#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL
-#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2
-#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL
-#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3
-#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL
-#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4
-#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL
-#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5
-#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL
-#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6
-#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL
-#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21
-#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL
-#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22
-#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL
-#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39
-#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL
-#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40
-#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL
-#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41
-#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL
-#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42
-#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43
-#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44
-#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
-
-union uvh_local_int0_enable_u {
-    unsigned long	v;
-    struct uvh_local_int0_enable_s {
-	unsigned long	lb_hcerr            :  1;  /* RW */
-	unsigned long	gr0_hcerr           :  1;  /* RW */
-	unsigned long	gr1_hcerr           :  1;  /* RW */
-	unsigned long	lh_hcerr            :  1;  /* RW */
-	unsigned long	rh_hcerr            :  1;  /* RW */
-	unsigned long	xn_hcerr            :  1;  /* RW */
-	unsigned long	si_hcerr            :  1;  /* RW */
-	unsigned long	lb_aoerr0           :  1;  /* RW */
-	unsigned long	gr0_aoerr0          :  1;  /* RW */
-	unsigned long	gr1_aoerr0          :  1;  /* RW */
-	unsigned long	lh_aoerr0           :  1;  /* RW */
-	unsigned long	rh_aoerr0           :  1;  /* RW */
-	unsigned long	xn_aoerr0           :  1;  /* RW */
-	unsigned long	si_aoerr0           :  1;  /* RW */
-	unsigned long	lb_aoerr1           :  1;  /* RW */
-	unsigned long	gr0_aoerr1          :  1;  /* RW */
-	unsigned long	gr1_aoerr1          :  1;  /* RW */
-	unsigned long	lh_aoerr1           :  1;  /* RW */
-	unsigned long	rh_aoerr1           :  1;  /* RW */
-	unsigned long	xn_aoerr1           :  1;  /* RW */
-	unsigned long	si_aoerr1           :  1;  /* RW */
-	unsigned long	rh_vpi_int          :  1;  /* RW */
-	unsigned long	system_shutdown_int :  1;  /* RW */
-	unsigned long	lb_irq_int_0        :  1;  /* RW */
-	unsigned long	lb_irq_int_1        :  1;  /* RW */
-	unsigned long	lb_irq_int_2        :  1;  /* RW */
-	unsigned long	lb_irq_int_3        :  1;  /* RW */
-	unsigned long	lb_irq_int_4        :  1;  /* RW */
-	unsigned long	lb_irq_int_5        :  1;  /* RW */
-	unsigned long	lb_irq_int_6        :  1;  /* RW */
-	unsigned long	lb_irq_int_7        :  1;  /* RW */
-	unsigned long	lb_irq_int_8        :  1;  /* RW */
-	unsigned long	lb_irq_int_9        :  1;  /* RW */
-	unsigned long	lb_irq_int_10       :  1;  /* RW */
-	unsigned long	lb_irq_int_11       :  1;  /* RW */
-	unsigned long	lb_irq_int_12       :  1;  /* RW */
-	unsigned long	lb_irq_int_13       :  1;  /* RW */
-	unsigned long	lb_irq_int_14       :  1;  /* RW */
-	unsigned long	lb_irq_int_15       :  1;  /* RW */
-	unsigned long	l1_nmi_int          :  1;  /* RW */
-	unsigned long	stop_clock          :  1;  /* RW */
-	unsigned long	asic_to_l1          :  1;  /* RW */
-	unsigned long	l1_to_asic          :  1;  /* RW */
-	unsigned long	ltc_int             :  1;  /* RW */
-	unsigned long	la_seq_trigger      :  1;  /* RW */
-	unsigned long	rsvd_45_63          : 19;  /*    */
-    } s;
-};
-
 /* ========================================================================= */
 /*                               UVH_NODE_ID                                 */
 /* ========================================================================= */
@@ -1111,26 +856,6 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
     } s;
 };
 
-/* ========================================================================= */
-/*                    UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR                      */
-/* ========================================================================= */
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL
-
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_rh_gam_cfg_overlay_config_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_cfg_overlay_config_mmr_s {
-	unsigned long	rsvd_0_25: 26;  /*    */
-	unsigned long	base   : 20;  /* RW */
-	unsigned long	rsvd_46_62: 17;  /*    */
-	unsigned long	enable :  1;  /* RW */
-    } s;
-};
-
 /* ========================================================================= */
 /*                    UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR                      */
 /* ========================================================================= */
@@ -1262,101 +987,6 @@ union uvh_rtc1_int_config_u {
     } s;
 };
 
-/* ========================================================================= */
-/*                           UVH_RTC2_INT_CONFIG                             */
-/* ========================================================================= */
-#define UVH_RTC2_INT_CONFIG 0x61600UL
-
-#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC2_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC2_INT_CONFIG_P_SHFT 13
-#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC2_INT_CONFIG_T_SHFT 15
-#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC2_INT_CONFIG_M_SHFT 16
-#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc2_int_config_u {
-    unsigned long	v;
-    struct uvh_rtc2_int_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                           UVH_RTC3_INT_CONFIG                             */
-/* ========================================================================= */
-#define UVH_RTC3_INT_CONFIG 0x61640UL
-
-#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC3_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC3_INT_CONFIG_P_SHFT 13
-#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC3_INT_CONFIG_T_SHFT 15
-#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC3_INT_CONFIG_M_SHFT 16
-#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc3_int_config_u {
-    unsigned long	v;
-    struct uvh_rtc3_int_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                            UVH_RTC_INC_RATIO                              */
-/* ========================================================================= */
-#define UVH_RTC_INC_RATIO 0x350000UL
-
-#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0
-#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL
-#define UVH_RTC_INC_RATIO_RATIO_SHFT 20
-#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL
-
-union uvh_rtc_inc_ratio_u {
-    unsigned long	v;
-    struct uvh_rtc_inc_ratio_s {
-	unsigned long	fraction : 20;  /* RW */
-	unsigned long	ratio    :  3;  /* RW */
-	unsigned long	rsvd_23_63: 41;  /*    */
-    } s;
-};
-
 /* ========================================================================= */
 /*                          UVH_SI_ADDR_MAP_CONFIG                           */
 /* ========================================================================= */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 364d015efebc..ef68ba48564b 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -20,6 +20,8 @@
 #include <asm/tsc.h>
 #include <asm/irq_vectors.h>
 
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD	0x000000000bUL
+
 static struct bau_control	**uv_bau_table_bases __read_mostly;
 static int			uv_bau_retry_limit __read_mostly;
 
@@ -478,16 +480,16 @@ static void uv_enable_timeouts(void)
 		 * To program the period, the SOFT_ACK_MODE must be off.
 		 */
 		mmr_image &= ~((unsigned long)1 <<
-			       UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 		/*
 		 * Set the 4-bit period.
 		 */
 		mmr_image &= ~((unsigned long)0xf <<
-			UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
 		mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
-			     UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 		/*
@@ -496,7 +498,7 @@ static void uv_enable_timeouts(void)
 		 * indicated in bits 2:0 (7 causes all of them to timeout).
 		 */
 		mmr_image |= ((unsigned long)1 <<
-			      UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 	}
-- 
cgit v1.2.3-55-g7522


From a072738e04f0eb26370e39ec679e9a0d65e49aea Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Thu, 11 Mar 2010 19:54:39 +0300
Subject: perf, x86: Implement initial P4 PMU driver

The netburst PMU is way different from the "architectural
perfomance monitoring" specification that current CPUs use.
P4 uses a tuple of ESCR+CCCR+COUNTER MSR registers to handle
perfomance monitoring events.

A few implementational details:

1) We need a separate x86_pmu::hw_config helper in struct
   x86_pmu since register bit-fields are quite different from P6,
   Core and later cpu series.

2) For the same reason is a x86_pmu::schedule_events helper
   introduced.

3) hw_perf_event::config consists of packed ESCR+CCCR values.
   It's allowed since in reality both registers only use a half
   of their size. Of course before making a real write into a
   particular MSR we need to unpack the value and extend it to
   a proper size.

4) The tuple of packed ESCR+CCCR in hw_perf_event::config
   doesn't describe the memory address of ESCR MSR register
   so that we need to keep a mapping between these tuples
   used and available ESCR (various P4 events may use same
   ESCRs but not simultaneously), for this sake every active
   event has a per-cpu map of hw_perf_event::idx <--> ESCR
   addresses.

5) Since hw_perf_event::idx is an offset to counter/control register
   we need to lift X86_PMC_MAX_GENERIC up, otherwise kernel
   strips it down to 8 registers and event armed may never be turned
   off (ie the bit in active_mask is set but the loop never reaches
   this index to check), thanks to Peter Zijlstra

Restrictions:

 - No cascaded counters support (do we ever need them?)
 - No dependent events support (so PERF_COUNT_HW_INSTRUCTIONS
   doesn't work for now)
 - There are events with same counters which can't work simultaneously
   (need to use intersected ones due to broken counter 1)
 - No PERF_COUNT_HW_CACHE_ events yet

Todo:

 - Implement dependent events
 - Need proper hashing for event opcodes (no linear search, good for
   debugging stage but not in real loads)
 - Some events counted during a clock cycle -- need to set threshold
   for them and count every clock cycle just to get summary statistics
   (ie to behave the same way as other PMUs do)
 - Need to swicth to use event_constraints
 - To support RAW events we need to encode a global list of P4 events
   into p4_templates
 - Cache events need to be added

Event support status matrix:

 Event			status
 -----------------------------
 cycles			works
 cache-references	works
 cache-misses		works
 branch-misses		works
 bus-cycles		partially (does not work on 64bit cpu with HT enabled)
 instruction		doesnt work (needs dependent event [mop tagging])
 branches		doesnt work

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100311165439.GB5129@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h      |   2 +-
 arch/x86/include/asm/perf_event_p4.h   | 707 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event.c       |  46 ++-
 arch/x86/kernel/cpu/perf_event_amd.c   |   2 +
 arch/x86/kernel/cpu/perf_event_intel.c |  15 +-
 arch/x86/kernel/cpu/perf_event_p4.c    | 607 ++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event_p6.c    |   2 +
 7 files changed, 1358 insertions(+), 23 deletions(-)
 create mode 100644 arch/x86/include/asm/perf_event_p4.h
 create mode 100644 arch/x86/kernel/cpu/perf_event_p4.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index a9038c951619..124dddd598f3 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -5,7 +5,7 @@
  * Performance event hw details:
  */
 
-#define X86_PMC_MAX_GENERIC					8
+#define X86_PMC_MAX_GENERIC				       32
 #define X86_PMC_MAX_FIXED					3
 
 #define X86_PMC_IDX_GENERIC				        0
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
new file mode 100644
index 000000000000..829f4711645f
--- /dev/null
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -0,0 +1,707 @@
+/*
+ * Netburst Perfomance Events (P4, old Xeon)
+ */
+
+#ifndef PERF_EVENT_P4_H
+#define PERF_EVENT_P4_H
+
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+
+/*
+ * NetBurst has perfomance MSRs shared between
+ * threads if HT is turned on, ie for both logical
+ * processors (mem: in turn in Atom with HT support
+ * perf-MSRs are not shared and every thread has its
+ * own perf-MSRs set)
+ */
+#define ARCH_P4_TOTAL_ESCR		(46)
+#define ARCH_P4_RESERVED_ESCR		(2) /* IQ_ESCR(0,1) not always present */
+#define ARCH_P4_MAX_ESCR		(ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
+#define ARCH_P4_MAX_CCCR		(18)
+#define ARCH_P4_MAX_COUNTER		(ARCH_P4_MAX_CCCR / 2)
+
+#define P4_EVNTSEL_EVENT_MASK		0x7e000000U
+#define P4_EVNTSEL_EVENT_SHIFT		25
+#define P4_EVNTSEL_EVENTMASK_MASK	0x01fffe00U
+#define P4_EVNTSEL_EVENTMASK_SHIFT	9
+#define P4_EVNTSEL_TAG_MASK		0x000001e0U
+#define P4_EVNTSEL_TAG_SHIFT		5
+#define P4_EVNTSEL_TAG_ENABLE		0x00000010U
+#define P4_EVNTSEL_T0_OS		0x00000008U
+#define P4_EVNTSEL_T0_USR		0x00000004U
+#define P4_EVNTSEL_T1_OS		0x00000002U
+#define P4_EVNTSEL_T1_USR		0x00000001U
+
+/* Non HT mask */
+#define P4_EVNTSEL_MASK				\
+	(P4_EVNTSEL_EVENT_MASK		|	\
+	P4_EVNTSEL_EVENTMASK_MASK	|	\
+	P4_EVNTSEL_TAG_MASK		|	\
+	P4_EVNTSEL_TAG_ENABLE		|	\
+	P4_EVNTSEL_T0_OS		|	\
+	P4_EVNTSEL_T0_USR)
+
+/* HT mask */
+#define P4_EVNTSEL_MASK_HT			\
+	(P4_EVNTSEL_MASK		|	\
+	P4_EVNTSEL_T1_OS		|	\
+	P4_EVNTSEL_T1_USR)
+
+#define P4_CCCR_OVF			0x80000000U
+#define P4_CCCR_CASCADE			0x40000000U
+#define P4_CCCR_OVF_PMI_T0		0x04000000U
+#define P4_CCCR_OVF_PMI_T1		0x08000000U
+#define P4_CCCR_FORCE_OVF		0x02000000U
+#define P4_CCCR_EDGE			0x01000000U
+#define P4_CCCR_THRESHOLD_MASK		0x00f00000U
+#define P4_CCCR_THRESHOLD_SHIFT		20
+#define P4_CCCR_THRESHOLD(v)		((v) << P4_CCCR_THRESHOLD_SHIFT)
+#define P4_CCCR_COMPLEMENT		0x00080000U
+#define P4_CCCR_COMPARE			0x00040000U
+#define P4_CCCR_ESCR_SELECT_MASK	0x0000e000U
+#define P4_CCCR_ESCR_SELECT_SHIFT	13
+#define P4_CCCR_ENABLE			0x00001000U
+#define P4_CCCR_THREAD_SINGLE		0x00010000U
+#define P4_CCCR_THREAD_BOTH		0x00020000U
+#define P4_CCCR_THREAD_ANY		0x00030000U
+
+/* Non HT mask */
+#define P4_CCCR_MASK				\
+	(P4_CCCR_OVF			|	\
+	P4_CCCR_CASCADE			|	\
+	P4_CCCR_OVF_PMI_T0		|	\
+	P4_CCCR_FORCE_OVF		|	\
+	P4_CCCR_EDGE			|	\
+	P4_CCCR_THRESHOLD_MASK		|	\
+	P4_CCCR_COMPLEMENT		|	\
+	P4_CCCR_COMPARE			|	\
+	P4_CCCR_ESCR_SELECT_MASK	|	\
+	P4_CCCR_ENABLE)
+
+/* HT mask */
+#define P4_CCCR_MASK_HT				\
+	(P4_CCCR_MASK			|	\
+	P4_CCCR_THREAD_ANY)
+
+/*
+ * format is 32 bit: ee ss aa aa
+ * where
+ *	ee - 8 bit event
+ *	ss - 8 bit selector
+ *	aa aa - 16 bits reserved for tags/attributes
+ */
+#define P4_EVENT_PACK(event, selector)		(((event) << 24) | ((selector) << 16))
+#define P4_EVENT_UNPACK_EVENT(packed)		(((packed) >> 24) & 0xff)
+#define P4_EVENT_UNPACK_SELECTOR(packed)	(((packed) >> 16) & 0xff)
+#define P4_EVENT_PACK_ATTR(attr)		((attr))
+#define P4_EVENT_UNPACK_ATTR(packed)		((packed) & 0xffff)
+#define P4_MAKE_EVENT_ATTR(class, name, bit)	class##_##name = (1 << bit)
+#define P4_EVENT_ATTR(class, name)		class##_##name
+#define P4_EVENT_ATTR_STR(class, name)		__stringify(class##_##name)
+
+/*
+ * config field is 64bit width and consists of
+ * HT << 63 | ESCR << 32 | CCCR
+ * where HT is HyperThreading bit (since ESCR
+ * has it reserved we may use it for own purpose)
+ *
+ * note that this is NOT the addresses of respective
+ * ESCR and CCCR but rather an only packed value should
+ * be unpacked and written to a proper addresses
+ *
+ * the base idea is to pack as much info as
+ * possible
+ */
+#define p4_config_pack_escr(v)		(((u64)(v)) << 32)
+#define p4_config_pack_cccr(v)		(((u64)(v)) & 0xffffffffULL)
+#define p4_config_unpack_escr(v)	(((u64)(v)) >> 32)
+#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xffffffffULL)
+
+#define p4_config_unpack_emask(v)			\
+	({						\
+		u32 t = p4_config_unpack_escr((v));	\
+		t  &= P4_EVNTSEL_EVENTMASK_MASK;	\
+		t >>= P4_EVNTSEL_EVENTMASK_SHIFT;	\
+		t;					\
+	})
+
+#define P4_CONFIG_HT_SHIFT		63
+#define P4_CONFIG_HT			(1ULL << P4_CONFIG_HT_SHIFT)
+
+static inline u32 p4_config_unpack_opcode(u64 config)
+{
+	u32 e, s;
+
+	/*
+	 * we don't care about HT presence here since
+	 * event opcode doesn't depend on it
+	 */
+	e = (p4_config_unpack_escr(config) & P4_EVNTSEL_EVENT_MASK) >> P4_EVNTSEL_EVENT_SHIFT;
+	s = (p4_config_unpack_cccr(config) & P4_CCCR_ESCR_SELECT_MASK) >> P4_CCCR_ESCR_SELECT_SHIFT;
+
+	return P4_EVENT_PACK(e, s);
+}
+
+static inline bool p4_is_event_cascaded(u64 config)
+{
+	u32 cccr = p4_config_unpack_cccr(config);
+	return !!(cccr & P4_CCCR_CASCADE);
+}
+
+static inline int p4_ht_config_thread(u64 config)
+{
+	return !!(config & P4_CONFIG_HT);
+}
+
+static inline u64 p4_set_ht_bit(u64 config)
+{
+	return config | P4_CONFIG_HT;
+}
+
+static inline u64 p4_clear_ht_bit(u64 config)
+{
+	return config & ~P4_CONFIG_HT;
+}
+
+static inline int p4_ht_active(void)
+{
+#ifdef CONFIG_SMP
+	return smp_num_siblings > 1;
+#endif
+	return 0;
+}
+
+static inline int p4_ht_thread(int cpu)
+{
+#ifdef CONFIG_SMP
+	if (smp_num_siblings == 2)
+		return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map));
+#endif
+	return 0;
+}
+
+static inline int p4_should_swap_ts(u64 config, int cpu)
+{
+	return p4_ht_config_thread(config) ^ p4_ht_thread(cpu);
+}
+
+static inline u32 p4_default_cccr_conf(int cpu)
+{
+	/*
+	 * Note that P4_CCCR_THREAD_ANY is "required" on
+	 * non-HT machines (on HT machines we count TS events
+	 * regardless the state of second logical processor
+	 */
+	u32 cccr = P4_CCCR_THREAD_ANY;
+
+	if (!p4_ht_thread(cpu))
+		cccr |= P4_CCCR_OVF_PMI_T0;
+	else
+		cccr |= P4_CCCR_OVF_PMI_T1;
+
+	return cccr;
+}
+
+static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
+{
+	u32 escr = 0;
+
+	if (!p4_ht_thread(cpu)) {
+		if (!exclude_os)
+			escr |= P4_EVNTSEL_T0_OS;
+		if (!exclude_usr)
+			escr |= P4_EVNTSEL_T0_USR;
+	} else {
+		if (!exclude_os)
+			escr |= P4_EVNTSEL_T1_OS;
+		if (!exclude_usr)
+			escr |= P4_EVNTSEL_T1_USR;
+	}
+
+	return escr;
+}
+
+/*
+ * Comments below the event represent ESCR restriction
+ * for this event and counter index per ESCR
+ *
+ * MSR_P4_IQ_ESCR0 and MSR_P4_IQ_ESCR1 are available only on early
+ * processor builds (family 0FH, models 01H-02H). These MSRs
+ * are not available on later versions, so that we don't use
+ * them completely
+ *
+ * Also note that CCCR1 do not have P4_CCCR_ENABLE bit properly
+ * working so that we should not use this CCCR and respective
+ * counter as result
+ */
+#define P4_TC_DELIVER_MODE		P4_EVENT_PACK(0x01, 0x01)
+	/*
+	 * MSR_P4_TC_ESCR0:	4, 5
+	 * MSR_P4_TC_ESCR1:	6, 7
+	 */
+
+#define P4_BPU_FETCH_REQUEST		P4_EVENT_PACK(0x03, 0x00)
+	/*
+	 * MSR_P4_BPU_ESCR0:	0, 1
+	 * MSR_P4_BPU_ESCR1:	2, 3
+	 */
+
+#define P4_ITLB_REFERENCE		P4_EVENT_PACK(0x18, 0x03)
+	/*
+	 * MSR_P4_ITLB_ESCR0:	0, 1
+	 * MSR_P4_ITLB_ESCR1:	2, 3
+	 */
+
+#define P4_MEMORY_CANCEL		P4_EVENT_PACK(0x02, 0x05)
+	/*
+	 * MSR_P4_DAC_ESCR0:	8, 9
+	 * MSR_P4_DAC_ESCR1:	10, 11
+	 */
+
+#define P4_MEMORY_COMPLETE		P4_EVENT_PACK(0x08, 0x02)
+	/*
+	 * MSR_P4_SAAT_ESCR0:	8, 9
+	 * MSR_P4_SAAT_ESCR1:	10, 11
+	 */
+
+#define P4_LOAD_PORT_REPLAY		P4_EVENT_PACK(0x04, 0x02)
+	/*
+	 * MSR_P4_SAAT_ESCR0:	8, 9
+	 * MSR_P4_SAAT_ESCR1:	10, 11
+	 */
+
+#define P4_STORE_PORT_REPLAY		P4_EVENT_PACK(0x05, 0x02)
+	/*
+	 * MSR_P4_SAAT_ESCR0:	8, 9
+	 * MSR_P4_SAAT_ESCR1:	10, 11
+	 */
+
+#define P4_MOB_LOAD_REPLAY		P4_EVENT_PACK(0x03, 0x02)
+	/*
+	 * MSR_P4_MOB_ESCR0:	0, 1
+	 * MSR_P4_MOB_ESCR1:	2, 3
+	 */
+
+#define P4_PAGE_WALK_TYPE		P4_EVENT_PACK(0x01, 0x04)
+	/*
+	 * MSR_P4_PMH_ESCR0:	0, 1
+	 * MSR_P4_PMH_ESCR1:	2, 3
+	 */
+
+#define P4_BSQ_CACHE_REFERENCE		P4_EVENT_PACK(0x0c, 0x07)
+	/*
+	 * MSR_P4_BSU_ESCR0:	0, 1
+	 * MSR_P4_BSU_ESCR1:	2, 3
+	 */
+
+#define P4_IOQ_ALLOCATION		P4_EVENT_PACK(0x03, 0x06)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_IOQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x1a, 0x06)
+	/*
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_FSB_DATA_ACTIVITY		P4_EVENT_PACK(0x17, 0x06)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_BSQ_ALLOCATION		P4_EVENT_PACK(0x05, 0x07)
+	/*
+	 * MSR_P4_BSU_ESCR0:	0, 1
+	 */
+
+#define P4_BSQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x06, 0x07)
+	/*
+	 * MSR_P4_BSU_ESCR1:	2, 3
+	 */
+
+#define P4_SSE_INPUT_ASSIST		P4_EVENT_PACK(0x34, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR:	8, 9
+	 * MSR_P4_FIRM_ESCR:	10, 11
+	 */
+
+#define P4_PACKED_SP_UOP		P4_EVENT_PACK(0x08, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_PACKED_DP_UOP		P4_EVENT_PACK(0x0c, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_SCALAR_SP_UOP		P4_EVENT_PACK(0x0a, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_SCALAR_DP_UOP		P4_EVENT_PACK(0x0e, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_64BIT_MMX_UOP		P4_EVENT_PACK(0x02, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_128BIT_MMX_UOP		P4_EVENT_PACK(0x1a, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_X87_FP_UOP			P4_EVENT_PACK(0x04, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_TC_MISC			P4_EVENT_PACK(0x06, 0x01)
+	/*
+	 * MSR_P4_TC_ESCR0:	4, 5
+	 * MSR_P4_TC_ESCR1:	6, 7
+	 */
+
+#define P4_GLOBAL_POWER_EVENTS		P4_EVENT_PACK(0x13, 0x06)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_TC_MS_XFER			P4_EVENT_PACK(0x05, 0x00)
+	/*
+	 * MSR_P4_MS_ESCR0:	4, 5
+	 * MSR_P4_MS_ESCR1:	6, 7
+	 */
+
+#define P4_UOP_QUEUE_WRITES		P4_EVENT_PACK(0x09, 0x00)
+	/*
+	 * MSR_P4_MS_ESCR0:	4, 5
+	 * MSR_P4_MS_ESCR1:	6, 7
+	 */
+
+#define P4_RETIRED_MISPRED_BRANCH_TYPE	P4_EVENT_PACK(0x05, 0x02)
+	/*
+	 * MSR_P4_TBPU_ESCR0:	4, 5
+	 * MSR_P4_TBPU_ESCR0:	6, 7
+	 */
+
+#define P4_RETIRED_BRANCH_TYPE		P4_EVENT_PACK(0x04, 0x02)
+	/*
+	 * MSR_P4_TBPU_ESCR0:	4, 5
+	 * MSR_P4_TBPU_ESCR0:	6, 7
+	 */
+
+#define P4_RESOURCE_STALL		P4_EVENT_PACK(0x01, 0x01)
+	/*
+	 * MSR_P4_ALF_ESCR0:	12, 13, 16
+	 * MSR_P4_ALF_ESCR1:	14, 15, 17
+	 */
+
+#define P4_WC_BUFFER			P4_EVENT_PACK(0x05, 0x05)
+	/*
+	 * MSR_P4_DAC_ESCR0:	8, 9
+	 * MSR_P4_DAC_ESCR1:	10, 11
+	 */
+
+#define P4_B2B_CYCLES			P4_EVENT_PACK(0x16, 0x03)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_BNR				P4_EVENT_PACK(0x08, 0x03)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_SNOOP			P4_EVENT_PACK(0x06, 0x03)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_RESPONSE			P4_EVENT_PACK(0x04, 0x03)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_FRONT_END_EVENT		P4_EVENT_PACK(0x08, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_EXECUTION_EVENT		P4_EVENT_PACK(0x0c, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_REPLAY_EVENT			P4_EVENT_PACK(0x09, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_INSTR_RETIRED		P4_EVENT_PACK(0x02, 0x04)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_UOPS_RETIRED			P4_EVENT_PACK(0x01, 0x04)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_UOP_TYPE			P4_EVENT_PACK(0x02, 0x02)
+	/*
+	 * MSR_P4_RAT_ESCR0:	12, 13, 16
+	 * MSR_P4_RAT_ESCR1:	14, 15, 17
+	 */
+
+#define P4_BRANCH_RETIRED		P4_EVENT_PACK(0x06, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_MISPRED_BRANCH_RETIRED	P4_EVENT_PACK(0x03, 0x04)
+	/*
+	 * MSR_P4_CRU_ESCR0:	12, 13, 16
+	 * MSR_P4_CRU_ESCR1:	14, 15, 17
+	 */
+
+#define P4_X87_ASSIST			P4_EVENT_PACK(0x03, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_MACHINE_CLEAR		P4_EVENT_PACK(0x02, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_INSTR_COMPLETED		P4_EVENT_PACK(0x07, 0x04)
+	/*
+	 * MSR_P4_CRU_ESCR0:	12, 13, 16
+	 * MSR_P4_CRU_ESCR1:	14, 15, 17
+	 */
+
+/*
+ * a caller should use P4_EVENT_ATTR helper to
+ * pick the attribute needed, for example
+ *
+ *	P4_EVENT_ATTR(P4_TC_DELIVER_MODE, DD)
+ */
+enum P4_EVENTS_ATTR {
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DD, 0),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DB, 1),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DI, 2),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BD, 3),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BB, 4),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BI, 5),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, ID, 6),
+
+	P4_MAKE_EVENT_ATTR(P4_BPU_FETCH_REQUEST, TCMISS, 0),
+
+	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, HIT, 0),
+	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, MISS, 1),
+	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, HIT_UK, 2),
+
+	P4_MAKE_EVENT_ATTR(P4_MEMORY_CANCEL, ST_RB_FULL, 2),
+	P4_MAKE_EVENT_ATTR(P4_MEMORY_CANCEL, 64K_CONF, 3),
+
+	P4_MAKE_EVENT_ATTR(P4_MEMORY_COMPLETE, LSC, 0),
+	P4_MAKE_EVENT_ATTR(P4_MEMORY_COMPLETE, SSC, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_LOAD_PORT_REPLAY, SPLIT_LD, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_STORE_PORT_REPLAY, SPLIT_ST, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, NO_STA, 1),
+	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, NO_STD, 3),
+	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
+	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
+
+	P4_MAKE_EVENT_ATTR(P4_PAGE_WALK_TYPE, DTMISS, 0),
+	P4_MAKE_EVENT_ATTR(P4_PAGE_WALK_TYPE, ITMISS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
+
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, DEFAULT, 0),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, ALL_READ, 5),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, ALL_WRITE, 6),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_UC, 7),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WC, 8),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WT, 9),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WP, 10),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WB, 11),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, OWN, 13),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, OTHER, 14),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, PREFETCH, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, OWN, 13),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, OTHER, 14),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
+
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_TYPE0, 0),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_TYPE1, 1),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LEN0, 2),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LEN1, 3),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE0, 11),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE1, 12),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE2, 13),
+
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
+
+	P4_MAKE_EVENT_ATTR(P4_SSE_INPUT_ASSIST, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_PACKED_SP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_PACKED_DP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_SCALAR_SP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_SCALAR_DP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_64BIT_MMX_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_128BIT_MMX_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_X87_FP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_TC_MISC, FLUSH, 4),
+
+	P4_MAKE_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING, 0),
+
+	P4_MAKE_EVENT_ATTR(P4_TC_MS_XFER, CISC, 0),
+
+	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
+	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
+	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_ROM, 2),
+
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
+
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL, 2),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, RETURN, 3),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, INDIRECT, 4),
+
+	P4_MAKE_EVENT_ATTR(P4_RESOURCE_STALL, SBFULL, 5),
+
+	P4_MAKE_EVENT_ATTR(P4_WC_BUFFER, WCB_EVICTS, 0),
+	P4_MAKE_EVENT_ATTR(P4_WC_BUFFER, WCB_FULL_EVICTS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_FRONT_END_EVENT, NBOGUS, 0),
+	P4_MAKE_EVENT_ATTR(P4_FRONT_END_EVENT, BOGUS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS0, 0),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS1, 1),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS2, 2),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS3, 3),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS0, 4),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS1, 5),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS2, 6),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS3, 7),
+
+	P4_MAKE_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS, 0),
+	P4_MAKE_EVENT_ATTR(P4_REPLAY_EVENT, BOGUS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG, 0),
+	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSTAG, 1),
+	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG, 2),
+	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSTAG, 3),
+
+	P4_MAKE_EVENT_ATTR(P4_UOPS_RETIRED, NBOGUS, 0),
+	P4_MAKE_EVENT_ATTR(P4_UOPS_RETIRED, BOGUS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS, 1),
+	P4_MAKE_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES, 2),
+
+	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMNP, 0),
+	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMNM, 1),
+	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMTP, 2),
+	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMTM, 3),
+
+	P4_MAKE_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
+
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, FPSU, 0),
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, FPSO, 1),
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, POAO, 2),
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, POAU, 3),
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, PREA, 4),
+
+	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, CLEAR, 0),
+	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, MOCLEAR, 1),
+	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, SMCLEAR, 2),
+
+	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, NBOGUS, 0),
+	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, BOGUS, 1),
+};
+
+#endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e24f6374f9f5..e6a3f5f81c96 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -190,6 +190,8 @@ struct x86_pmu {
 	void		(*enable_all)(void);
 	void		(*enable)(struct perf_event *);
 	void		(*disable)(struct perf_event *);
+	int		(*hw_config)(struct perf_event_attr *attr, struct hw_perf_event *hwc);
+	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
 	unsigned	eventsel;
 	unsigned	perfctr;
 	u64		(*event_map)(int);
@@ -415,6 +417,25 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 	return 0;
 }
 
+static int x86_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
+{
+	/*
+	 * Generate PMC IRQs:
+	 * (keep 'enabled' bit clear for now)
+	 */
+	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+
+	/*
+	 * Count user and OS events unless requested not to
+	 */
+	if (!attr->exclude_user)
+		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+	if (!attr->exclude_kernel)
+		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+
+	return 0;
+}
+
 /*
  * Setup the hardware configuration for a given attr_type
  */
@@ -446,23 +467,13 @@ static int __hw_perf_event_init(struct perf_event *event)
 
 	event->destroy = hw_perf_event_destroy;
 
-	/*
-	 * Generate PMC IRQs:
-	 * (keep 'enabled' bit clear for now)
-	 */
-	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
-
 	hwc->idx = -1;
 	hwc->last_cpu = -1;
 	hwc->last_tag = ~0ULL;
 
-	/*
-	 * Count user and OS events unless requested not to.
-	 */
-	if (!attr->exclude_user)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
-	if (!attr->exclude_kernel)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+	/* Processor specifics */
+	if (x86_pmu.hw_config(attr, hwc))
+		return -EOPNOTSUPP;
 
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
@@ -517,7 +528,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 			return -EOPNOTSUPP;
 
 		/* BTS is currently only allowed for user-mode. */
-		if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		if (!attr->exclude_kernel)
 			return -EOPNOTSUPP;
 	}
 
@@ -931,7 +942,7 @@ static int x86_pmu_enable(struct perf_event *event)
 	if (n < 0)
 		return n;
 
-	ret = x86_schedule_events(cpuc, n, assign);
+	ret = x86_pmu.schedule_events(cpuc, n, assign);
 	if (ret)
 		return ret;
 	/*
@@ -1263,7 +1274,7 @@ int hw_perf_group_sched_in(struct perf_event *leader,
 	if (n0 < 0)
 		return n0;
 
-	ret = x86_schedule_events(cpuc, n0, assign);
+	ret = x86_pmu.schedule_events(cpuc, n0, assign);
 	if (ret)
 		return ret;
 
@@ -1313,6 +1324,7 @@ undo:
 
 #include "perf_event_amd.c"
 #include "perf_event_p6.c"
+#include "perf_event_p4.c"
 #include "perf_event_intel_lbr.c"
 #include "perf_event_intel_ds.c"
 #include "perf_event_intel.c"
@@ -1515,7 +1527,7 @@ static int validate_group(struct perf_event *event)
 
 	fake_cpuc->n_events = n;
 
-	ret = x86_schedule_events(fake_cpuc, n, NULL);
+	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
 
 out_free:
 	kfree(fake_cpuc);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 573458f1caf2..358a8e3d05f8 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -363,6 +363,8 @@ static __initconst struct x86_pmu amd_pmu = {
 	.enable_all		= x86_pmu_enable_all,
 	.enable			= x86_pmu_enable_event,
 	.disable		= x86_pmu_disable_event,
+	.hw_config		= x86_hw_config,
+	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_K7_EVNTSEL0,
 	.perfctr		= MSR_K7_PERFCTR0,
 	.event_map		= amd_pmu_event_map,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 971dc6e7d54b..044b8436b19d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -749,6 +749,8 @@ static __initconst struct x86_pmu core_pmu = {
 	.enable_all		= x86_pmu_enable_all,
 	.enable			= x86_pmu_enable_event,
 	.disable		= x86_pmu_disable_event,
+	.hw_config		= x86_hw_config,
+	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
@@ -786,6 +788,8 @@ static __initconst struct x86_pmu intel_pmu = {
 	.enable_all		= intel_pmu_enable_all,
 	.enable			= intel_pmu_enable_event,
 	.disable		= intel_pmu_disable_event,
+	.hw_config		= x86_hw_config,
+	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
@@ -839,12 +843,13 @@ static __init int intel_pmu_init(void)
 	int version;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-		/* check for P6 processor family */
-	   if (boot_cpu_data.x86 == 6) {
-		return p6_pmu_init();
-	   } else {
+		switch (boot_cpu_data.x86) {
+		case 0x6:
+			return p6_pmu_init();
+		case 0xf:
+			return p4_pmu_init();
+		}
 		return -ENODEV;
-	   }
 	}
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
new file mode 100644
index 000000000000..381f593e8297
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -0,0 +1,607 @@
+/*
+ * Netburst Perfomance Events (P4, old Xeon)
+ *
+ *  Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
+ *  Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+
+#include <asm/perf_event_p4.h>
+
+/*
+ * array indices: 0,1 - HT threads, used with HT enabled cpu
+ */
+struct p4_event_template {
+	u32 opcode;			/* ESCR event + CCCR selector */
+	u64 config;			/* packed predefined bits */
+	int dep;			/* upstream dependency event index */
+	unsigned int emask;		/* ESCR EventMask */
+	unsigned int escr_msr[2];	/* ESCR MSR for this event */
+	unsigned int cntr[2];		/* counter index (offset) */
+};
+
+struct p4_pmu_res {
+	/* maps hw_conf::idx into template for ESCR sake */
+	struct p4_event_template *tpl[ARCH_P4_MAX_CCCR];
+};
+
+static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
+
+/*
+ * WARN: CCCR1 doesn't have a working enable bit so try to not
+ * use it if possible
+ *
+ * Also as only we start to support raw events we will need to
+ * append _all_ P4_EVENT_PACK'ed events here
+ */
+struct p4_event_template p4_templates[] = {
+	[0] = {
+		.opcode	= P4_UOP_TYPE,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
+			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
+		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+		.cntr		= { 16, 17 },
+	},
+	[1] = {
+		.opcode	= P4_GLOBAL_POWER_EVENTS,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+	[2] = {
+		.opcode	= P4_INSTR_RETIRED,
+		.config	= 0,
+		.dep	= 0, /* needs front-end tagging */
+		.emask	=
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG)	|
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSTAG)	|
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG)	|
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSTAG),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { 12, 14 },
+	},
+	[3] = {
+		.opcode	= P4_BSQ_CACHE_REFERENCE,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITM),
+		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+	[4] = {
+		.opcode	= P4_BSQ_CACHE_REFERENCE,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
+		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+		.cntr		= { 0, 3 },
+	},
+	[5] = {
+		.opcode	= P4_RETIRED_BRANCH_TYPE,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL)	|
+			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL)		|
+			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, RETURN)		|
+			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, INDIRECT),
+		.escr_msr	= { MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1 },
+		.cntr		= { 4, 6 },
+	},
+	[6] = {
+		.opcode	= P4_MISPRED_BRANCH_RETIRED,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { 12, 14 },
+	},
+	[7] = {
+		.opcode	= P4_FSB_DATA_ACTIVITY,
+		.config	= p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV)	|
+			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+};
+
+static struct p4_event_template *p4_event_map[PERF_COUNT_HW_MAX] = {
+	/* non-halted CPU clocks */
+	[PERF_COUNT_HW_CPU_CYCLES]		= &p4_templates[1],
+
+	/* retired instructions: dep on tagging the FSB */
+	[PERF_COUNT_HW_INSTRUCTIONS]		= &p4_templates[2],
+
+	/* cache hits */
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= &p4_templates[3],
+
+	/* cache misses */
+	[PERF_COUNT_HW_CACHE_MISSES]		= &p4_templates[4],
+
+	/* branch instructions retired */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= &p4_templates[5],
+
+	/* mispredicted branches retired */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= &p4_templates[6],
+
+	/* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
+	[PERF_COUNT_HW_BUS_CYCLES]		= &p4_templates[7],
+};
+
+static u64 p4_pmu_event_map(int hw_event)
+{
+	struct p4_event_template *tpl;
+	u64 config;
+
+	if (hw_event > ARRAY_SIZE(p4_event_map)) {
+		printk_once(KERN_ERR "PMU: Incorrect event index\n");
+		return 0;
+	}
+	tpl = p4_event_map[hw_event];
+
+	/*
+	 * fill config up according to
+	 * a predefined event template
+	 */
+	config  = tpl->config;
+	config |= p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(tpl->opcode) << P4_EVNTSEL_EVENT_SHIFT);
+	config |= p4_config_pack_escr(tpl->emask << P4_EVNTSEL_EVENTMASK_SHIFT);
+	config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
+
+	/* on HT machine we need a special bit */
+	if (p4_ht_active() && p4_ht_thread(raw_smp_processor_id()))
+		config = p4_set_ht_bit(config);
+
+	return config;
+}
+
+/*
+ * Note that we still have 5 events (from global events SDM list)
+ * intersected in opcode+emask bits so we will need another
+ * scheme there do distinguish templates.
+ */
+static inline int p4_pmu_emask_match(unsigned int dst, unsigned int src)
+{
+	return dst & src;
+}
+
+static struct p4_event_template *p4_pmu_template_lookup(u64 config)
+{
+	u32 opcode = p4_config_unpack_opcode(config);
+	unsigned int emask = p4_config_unpack_emask(config);
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(p4_templates); i++) {
+		if (opcode == p4_templates[i].opcode &&
+			p4_pmu_emask_match(emask, p4_templates[i].emask))
+			return &p4_templates[i];
+	}
+
+	return NULL;
+}
+
+/*
+ * We don't control raw events so it's up to the caller
+ * to pass sane values (and we don't count the thread number
+ * on HT machine but allow HT-compatible specifics to be
+ * passed on)
+ */
+static u64 p4_pmu_raw_event(u64 hw_event)
+{
+	return hw_event &
+		(p4_config_pack_escr(P4_EVNTSEL_MASK_HT) |
+		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
+}
+
+static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
+{
+	int cpu = raw_smp_processor_id();
+
+	/*
+	 * the reason we use cpu that early is that: if we get scheduled
+	 * first time on the same cpu -- we will not need swap thread
+	 * specific flags in config (and will save some cpu cycles)
+	 */
+
+	/* CCCR by default */
+	hwc->config = p4_config_pack_cccr(p4_default_cccr_conf(cpu));
+
+	/* Count user and OS events unless not requested to */
+	hwc->config |= p4_config_pack_escr(p4_default_escr_conf(cpu, attr->exclude_kernel,
+								attr->exclude_user));
+	return 0;
+}
+
+static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+{
+	unsigned long dummy;
+
+	rdmsrl(hwc->config_base + hwc->idx, dummy);
+	if (dummy & P4_CCCR_OVF) {
+		(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+			((u64)dummy) & ~P4_CCCR_OVF);
+	}
+}
+
+static inline void p4_pmu_disable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	/*
+	 * If event gets disabled while counter is in overflowed
+	 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
+	 * asserted again and again
+	 */
+	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+		(u64)(p4_config_unpack_cccr(hwc->config)) &
+			~P4_CCCR_ENABLE & ~P4_CCCR_OVF);
+}
+
+static void p4_pmu_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx;
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+		struct perf_event *event = cpuc->events[idx];
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		p4_pmu_disable_event(event);
+	}
+}
+
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int thread = p4_ht_config_thread(hwc->config);
+	u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
+	u64 escr_base;
+	struct p4_event_template *tpl;
+	struct p4_pmu_res *c;
+
+	/*
+	 * some preparation work from per-cpu private fields
+	 * since we need to find out which ESCR to use
+	 */
+	c = &__get_cpu_var(p4_pmu_config);
+	tpl = c->tpl[hwc->idx];
+	if (!tpl) {
+		pr_crit("%s: Wrong index: %d\n", __func__, hwc->idx);
+		return;
+	}
+	escr_base = (u64)tpl->escr_msr[thread];
+
+	/*
+	 * - we dont support cascaded counters yet
+	 * - and counter 1 is broken (erratum)
+	 */
+	WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
+	WARN_ON_ONCE(hwc->idx == 1);
+
+	(void)checking_wrmsrl(escr_base, escr_conf);
+	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+		(u64)(p4_config_unpack_cccr(hwc->config)) | P4_CCCR_ENABLE);
+}
+
+static void p4_pmu_enable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx;
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+		struct perf_event *event = cpuc->events[idx];
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		p4_pmu_enable_event(event);
+	}
+}
+
+static int p4_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	struct perf_event *event;
+	struct hw_perf_event *hwc;
+	int idx, handled = 0;
+	u64 val;
+
+	data.addr = 0;
+	data.raw = NULL;
+
+	cpuc = &__get_cpu_var(cpu_hw_events);
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		event = cpuc->events[idx];
+		hwc = &event->hw;
+
+		WARN_ON_ONCE(hwc->idx != idx);
+
+		/*
+		 * FIXME: Redundant call, actually not needed
+		 * but just to check if we're screwed
+		 */
+		p4_pmu_clear_cccr_ovf(hwc);
+
+		val = x86_perf_event_update(event);
+		if (val & (1ULL << (x86_pmu.event_bits - 1)))
+			continue;
+
+		/*
+		 * event overflow
+		 */
+		handled		= 1;
+		data.period	= event->hw.last_period;
+
+		if (!x86_perf_event_set_period(event))
+			continue;
+		if (perf_event_overflow(event, 1, &data, regs))
+			p4_pmu_disable_event(event);
+	}
+
+	if (handled) {
+		/* p4 quirk: unmask it again */
+		apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+		inc_irq_stat(apic_perf_irqs);
+	}
+
+	return handled;
+}
+
+/*
+ * swap thread specific fields according to a thread
+ * we are going to run on
+ */
+static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
+{
+	u32 escr, cccr;
+
+	/*
+	 * we either lucky and continue on same cpu or no HT support
+	 */
+	if (!p4_should_swap_ts(hwc->config, cpu))
+		return;
+
+	/*
+	 * the event is migrated from an another logical
+	 * cpu, so we need to swap thread specific flags
+	 */
+
+	escr = p4_config_unpack_escr(hwc->config);
+	cccr = p4_config_unpack_cccr(hwc->config);
+
+	if (p4_ht_thread(cpu)) {
+		cccr &= ~P4_CCCR_OVF_PMI_T0;
+		cccr |= P4_CCCR_OVF_PMI_T1;
+		if (escr & P4_EVNTSEL_T0_OS) {
+			escr &= ~P4_EVNTSEL_T0_OS;
+			escr |= P4_EVNTSEL_T1_OS;
+		}
+		if (escr & P4_EVNTSEL_T0_USR) {
+			escr &= ~P4_EVNTSEL_T0_USR;
+			escr |= P4_EVNTSEL_T1_USR;
+		}
+		hwc->config  = p4_config_pack_escr(escr);
+		hwc->config |= p4_config_pack_cccr(cccr);
+		hwc->config |= P4_CONFIG_HT;
+	} else {
+		cccr &= ~P4_CCCR_OVF_PMI_T1;
+		cccr |= P4_CCCR_OVF_PMI_T0;
+		if (escr & P4_EVNTSEL_T1_OS) {
+			escr &= ~P4_EVNTSEL_T1_OS;
+			escr |= P4_EVNTSEL_T0_OS;
+		}
+		if (escr & P4_EVNTSEL_T1_USR) {
+			escr &= ~P4_EVNTSEL_T1_USR;
+			escr |= P4_EVNTSEL_T0_USR;
+		}
+		hwc->config  = p4_config_pack_escr(escr);
+		hwc->config |= p4_config_pack_cccr(cccr);
+		hwc->config &= ~P4_CONFIG_HT;
+	}
+}
+
+/* ESCRs are not sequential in memory so we need a map */
+static unsigned int p4_escr_map[ARCH_P4_TOTAL_ESCR] = {
+	MSR_P4_ALF_ESCR0,	/*  0 */
+	MSR_P4_ALF_ESCR1,	/*  1 */
+	MSR_P4_BPU_ESCR0,	/*  2 */
+	MSR_P4_BPU_ESCR1,	/*  3 */
+	MSR_P4_BSU_ESCR0,	/*  4 */
+	MSR_P4_BSU_ESCR1,	/*  5 */
+	MSR_P4_CRU_ESCR0,	/*  6 */
+	MSR_P4_CRU_ESCR1,	/*  7 */
+	MSR_P4_CRU_ESCR2,	/*  8 */
+	MSR_P4_CRU_ESCR3,	/*  9 */
+	MSR_P4_CRU_ESCR4,	/* 10 */
+	MSR_P4_CRU_ESCR5,	/* 11 */
+	MSR_P4_DAC_ESCR0,	/* 12 */
+	MSR_P4_DAC_ESCR1,	/* 13 */
+	MSR_P4_FIRM_ESCR0,	/* 14 */
+	MSR_P4_FIRM_ESCR1,	/* 15 */
+	MSR_P4_FLAME_ESCR0,	/* 16 */
+	MSR_P4_FLAME_ESCR1,	/* 17 */
+	MSR_P4_FSB_ESCR0,	/* 18 */
+	MSR_P4_FSB_ESCR1,	/* 19 */
+	MSR_P4_IQ_ESCR0,	/* 20 */
+	MSR_P4_IQ_ESCR1,	/* 21 */
+	MSR_P4_IS_ESCR0,	/* 22 */
+	MSR_P4_IS_ESCR1,	/* 23 */
+	MSR_P4_ITLB_ESCR0,	/* 24 */
+	MSR_P4_ITLB_ESCR1,	/* 25 */
+	MSR_P4_IX_ESCR0,	/* 26 */
+	MSR_P4_IX_ESCR1,	/* 27 */
+	MSR_P4_MOB_ESCR0,	/* 28 */
+	MSR_P4_MOB_ESCR1,	/* 29 */
+	MSR_P4_MS_ESCR0,	/* 30 */
+	MSR_P4_MS_ESCR1,	/* 31 */
+	MSR_P4_PMH_ESCR0,	/* 32 */
+	MSR_P4_PMH_ESCR1,	/* 33 */
+	MSR_P4_RAT_ESCR0,	/* 34 */
+	MSR_P4_RAT_ESCR1,	/* 35 */
+	MSR_P4_SAAT_ESCR0,	/* 36 */
+	MSR_P4_SAAT_ESCR1,	/* 37 */
+	MSR_P4_SSU_ESCR0,	/* 38 */
+	MSR_P4_SSU_ESCR1,	/* 39 */
+	MSR_P4_TBPU_ESCR0,	/* 40 */
+	MSR_P4_TBPU_ESCR1,	/* 41 */
+	MSR_P4_TC_ESCR0,	/* 42 */
+	MSR_P4_TC_ESCR1,	/* 43 */
+	MSR_P4_U2L_ESCR0,	/* 44 */
+	MSR_P4_U2L_ESCR1,	/* 45 */
+};
+
+static int p4_get_escr_idx(unsigned int addr)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(p4_escr_map); i++) {
+		if (addr == p4_escr_map[i])
+			return i;
+	}
+
+	return -1;
+}
+
+static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long escr_mask[BITS_TO_LONGS(ARCH_P4_TOTAL_ESCR)];
+
+	struct hw_perf_event *hwc;
+	struct p4_event_template *tpl;
+	struct p4_pmu_res *c;
+	int cpu = raw_smp_processor_id();
+	int escr_idx, thread, i, num;
+
+	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+	bitmap_zero(escr_mask, ARCH_P4_TOTAL_ESCR);
+
+	c = &__get_cpu_var(p4_pmu_config);
+	/*
+	 * Firstly find out which resource events are going
+	 * to use, if ESCR+CCCR tuple is already borrowed
+	 * then get out of here
+	 */
+	for (i = 0, num = n; i < n; i++, num--) {
+		hwc = &cpuc->event_list[i]->hw;
+		tpl = p4_pmu_template_lookup(hwc->config);
+		if (!tpl)
+			goto done;
+		thread = p4_ht_thread(cpu);
+		escr_idx = p4_get_escr_idx(tpl->escr_msr[thread]);
+		if (escr_idx == -1)
+			goto done;
+
+		/* already allocated and remains on the same cpu */
+		if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+			if (assign)
+				assign[i] = hwc->idx;
+			/* upstream dependent event */
+			if (unlikely(tpl->dep != -1))
+				printk_once(KERN_WARNING "PMU: Dep events are "
+					"not implemented yet\n");
+			goto reserve;
+		}
+
+		/* it may be already borrowed */
+		if (test_bit(tpl->cntr[thread], used_mask) ||
+			test_bit(escr_idx, escr_mask))
+			goto done;
+
+		/*
+		 * ESCR+CCCR+COUNTERs are available to use lets swap
+		 * thread specific bits, push assigned bits
+		 * back and save template into per-cpu
+		 * area (which will allow us to find out the ESCR
+		 * to be used at moment of "enable event via real MSR")
+		 */
+		p4_pmu_swap_config_ts(hwc, cpu);
+		if (assign) {
+			assign[i] = tpl->cntr[thread];
+			c->tpl[assign[i]] = tpl;
+		}
+reserve:
+		set_bit(tpl->cntr[thread], used_mask);
+		set_bit(escr_idx, escr_mask);
+	}
+
+done:
+	return num ? -ENOSPC : 0;
+}
+
+static __initconst struct x86_pmu p4_pmu = {
+	.name			= "Netburst P4/Xeon",
+	.handle_irq		= p4_pmu_handle_irq,
+	.disable_all		= p4_pmu_disable_all,
+	.enable_all		= p4_pmu_enable_all,
+	.enable			= p4_pmu_enable_event,
+	.disable		= p4_pmu_disable_event,
+	.eventsel		= MSR_P4_BPU_CCCR0,
+	.perfctr		= MSR_P4_BPU_PERFCTR0,
+	.event_map		= p4_pmu_event_map,
+	.raw_event		= p4_pmu_raw_event,
+	.max_events		= ARRAY_SIZE(p4_event_map),
+	.get_event_constraints	= x86_get_event_constraints,
+	/*
+	 * IF HT disabled we may need to use all
+	 * ARCH_P4_MAX_CCCR counters simulaneously
+	 * though leave it restricted at moment assuming
+	 * HT is on
+	 */
+	.num_events		= ARCH_P4_MAX_CCCR,
+	.apic			= 1,
+	.event_bits		= 40,
+	.event_mask		= (1ULL << 40) - 1,
+	.max_period		= (1ULL << 39) - 1,
+	.hw_config		= p4_hw_config,
+	.schedule_events	= p4_pmu_schedule_events,
+};
+
+static __init int p4_pmu_init(void)
+{
+	unsigned int low, high;
+
+	/* If we get stripped -- indexig fails */
+	BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
+
+	rdmsr(MSR_IA32_MISC_ENABLE, low, high);
+	if (!(low & (1 << 7))) {
+		pr_cont("unsupported Netburst CPU model %d ",
+			boot_cpu_data.x86_model);
+		return -ENODEV;
+	}
+
+	pr_cont("Netburst events, ");
+
+	x86_pmu = p4_pmu;
+
+	return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index a330485d14da..6ff4d01d880f 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -109,6 +109,8 @@ static __initconst struct x86_pmu p6_pmu = {
 	.enable_all		= p6_pmu_enable_all,
 	.enable			= p6_pmu_enable_event,
 	.disable		= p6_pmu_disable_event,
+	.hw_config		= x86_hw_config,
+	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_P6_EVNTSEL0,
 	.perfctr		= MSR_P6_PERFCTR0,
 	.event_map		= p6_pmu_event_map,
-- 
cgit v1.2.3-55-g7522


From 0b861225a5890f22445f08ca9cc7a87cff276ff7 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Fri, 12 Mar 2010 00:50:16 +0300
Subject: x86, perf: Fix NULL deref on not assigned x86_pmu

In case of not assigned x86_pmu and software events NULL dereference may
being hit via x86_pmu::schedule_events method.

Fix it by checking if x86_pmu is initialized at all.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20100311215016.GG25162@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e6a3f5f81c96..5586a02067d8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1269,6 +1269,9 @@ int hw_perf_group_sched_in(struct perf_event *leader,
 	int assign[X86_PMC_IDX_MAX];
 	int n0, n1, ret;
 
+	if (!x86_pmu_initialized())
+		return 0;
+
 	/* n0 = total number of events */
 	n0 = collect_events(cpuc, leader, true);
 	if (n0 < 0)
-- 
cgit v1.2.3-55-g7522


From c4507257764ae0387f0d8c2ca57808f37e4c9439 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Thu, 11 Mar 2010 14:04:47 -0800
Subject: time: Clean up direct xtime usage in xen

Cleanup xen's direct use of internal timekeeping values.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/xen/time.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 0d3f07cd1b5f..6365df925cb1 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -475,6 +475,7 @@ void xen_timer_resume(void)
 __init void xen_time_init(void)
 {
 	int cpu = smp_processor_id();
+	struct timespec tp;
 
 	clocksource_register(&xen_clocksource);
 
@@ -486,9 +487,8 @@ __init void xen_time_init(void)
 	}
 
 	/* Set initial system time with full resolution */
-	xen_read_wallclock(&xtime);
-	set_normalized_timespec(&wall_to_monotonic,
-				-xtime.tv_sec, -xtime.tv_nsec);
+	xen_read_wallclock(&tp);
+	do_settimeofday(&tp);
 
 	setup_force_cpu_cap(X86_FEATURE_TSC);
 
-- 
cgit v1.2.3-55-g7522


From 32cbd7dfce93382a70f155bf539871b4c55bed29 Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Sat, 13 Mar 2010 16:28:42 +0800
Subject: crypto: aesni-intel - Fix CTR optimization build failure with gas
 2.16.1

Andrew Morton reported that AES-NI CTR optimization failed to compile
with gas 2.16.1, the error message is as follow:

arch/x86/crypto/aesni-intel_asm.S: Assembler messages:
arch/x86/crypto/aesni-intel_asm.S:752: Error: suffix or operands invalid for `movq'
arch/x86/crypto/aesni-intel_asm.S:753: Error: suffix or operands invalid for `movq'

To fix this, a gas macro is defined to assemble movq with 64bit
general purpose registers and XMM registers. The macro will generate
the raw .byte sequence for needed instructions.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_asm.S |  4 +-
 arch/x86/include/asm/inst.h       | 96 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 95 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 822846a9eba9..ff16756a51c1 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -749,8 +749,8 @@ _aesni_inc_init:
 	movaps IV, CTR
 	PSHUFB_XMM BSWAP_MASK CTR
 	mov $1, TCTR_LOW
-	movq TCTR_LOW, INC
-	movq CTR, TCTR_LOW
+	MOVQ_R64_XMM TCTR_LOW INC
+	MOVQ_R64_XMM CTR TCTR_LOW
 	ret
 
 /*
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index 14cf526091f9..840a399701b2 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -7,7 +7,66 @@
 
 #ifdef __ASSEMBLY__
 
+#define REG_NUM_INVALID		100
+
+#define REG_TYPE_R64		0
+#define REG_TYPE_XMM		1
+#define REG_TYPE_INVALID	100
+
+	.macro R64_NUM opd r64
+	\opd = REG_NUM_INVALID
+	.ifc \r64,%rax
+	\opd = 0
+	.endif
+	.ifc \r64,%rcx
+	\opd = 1
+	.endif
+	.ifc \r64,%rdx
+	\opd = 2
+	.endif
+	.ifc \r64,%rbx
+	\opd = 3
+	.endif
+	.ifc \r64,%rsp
+	\opd = 4
+	.endif
+	.ifc \r64,%rbp
+	\opd = 5
+	.endif
+	.ifc \r64,%rsi
+	\opd = 6
+	.endif
+	.ifc \r64,%rdi
+	\opd = 7
+	.endif
+	.ifc \r64,%r8
+	\opd = 8
+	.endif
+	.ifc \r64,%r9
+	\opd = 9
+	.endif
+	.ifc \r64,%r10
+	\opd = 10
+	.endif
+	.ifc \r64,%r11
+	\opd = 11
+	.endif
+	.ifc \r64,%r12
+	\opd = 12
+	.endif
+	.ifc \r64,%r13
+	\opd = 13
+	.endif
+	.ifc \r64,%r14
+	\opd = 14
+	.endif
+	.ifc \r64,%r15
+	\opd = 15
+	.endif
+	.endm
+
 	.macro XMM_NUM opd xmm
+	\opd = REG_NUM_INVALID
 	.ifc \xmm,%xmm0
 	\opd = 0
 	.endif
@@ -58,13 +117,25 @@
 	.endif
 	.endm
 
+	.macro REG_TYPE type reg
+	R64_NUM reg_type_r64 \reg
+	XMM_NUM reg_type_xmm \reg
+	.if reg_type_r64 != REG_NUM_INVALID
+	\type = REG_TYPE_R64
+	.elseif reg_type_xmm != REG_NUM_INVALID
+	\type = REG_TYPE_XMM
+	.else
+	\type = REG_TYPE_INVALID
+	.endif
+	.endm
+
 	.macro PFX_OPD_SIZE
 	.byte 0x66
 	.endm
 
-	.macro PFX_REX opd1 opd2
-	.if (\opd1 | \opd2) & 8
-	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1)
+	.macro PFX_REX opd1 opd2 W=0
+	.if ((\opd1 | \opd2) & 8) || \W
+	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
 	.endif
 	.endm
 
@@ -145,6 +216,25 @@
 	.byte 0x0f, 0x38, 0xdf
 	MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2
 	.endm
+
+	.macro MOVQ_R64_XMM opd1 opd2
+	REG_TYPE movq_r64_xmm_opd1_type \opd1
+	.if movq_r64_xmm_opd1_type == REG_TYPE_XMM
+	XMM_NUM movq_r64_xmm_opd1 \opd1
+	R64_NUM movq_r64_xmm_opd2 \opd2
+	.else
+	R64_NUM movq_r64_xmm_opd1 \opd1
+	XMM_NUM movq_r64_xmm_opd2 \opd2
+	.endif
+	PFX_OPD_SIZE
+	PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1
+	.if movq_r64_xmm_opd1_type == REG_TYPE_XMM
+	.byte 0x0f, 0x7e
+	.else
+	.byte 0x0f, 0x6e
+	.endif
+	MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
+	.endm
 #endif
 
 #endif
-- 
cgit v1.2.3-55-g7522


From 8576e1971663ffdb6139041de97cdd2e1d4791cc Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Sat, 13 Mar 2010 11:11:16 +0300
Subject: x86, perf: Unmask LVTPC only if we have APIC supported

Ingo reported:

 |
 | There's a build failure on -tip with the P4 driver, on UP 32-bit, if
 | PERF_EVENTS is enabled but UP_APIC is disabled:
 |
 | arch/x86/built-in.o: In function `p4_pmu_handle_irq':
 | perf_event.c:(.text+0xa756): undefined reference to `apic'
 | perf_event.c:(.text+0xa76e): undefined reference to `apic'
 |

So we have to unmask LVTPC only if we're configured to have one.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Lin Ming <ming.m.lin@intel.com>
CC: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20100313081116.GA5179@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 381f593e8297..ef861da1c8ab 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -365,8 +365,10 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 	}
 
 	if (handled) {
+#ifdef CONFIG_X86_LOCAL_APIC
 		/* p4 quirk: unmask it again */
 		apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+#endif
 		inc_irq_stat(apic_perf_irqs);
 	}
 
-- 
cgit v1.2.3-55-g7522


From 68ca406930d6380b3be7ada5f15fcf85bfcbd552 Mon Sep 17 00:00:00 2001
From: Len Brown
Date: Fri, 19 Feb 2010 00:09:22 -0500
Subject: ACPI: delete the "acpi=ht" boot option

acpi=ht was important in 2003 -- before ACPI was
universally deployed and enabled by default in
the major Linux distributions.

At that time, there were a fair number of people who
or chose to, or needed to, run with acpi=off,
yet also wanted access to Hyper-threading.

Today we find that many invocations of "acpi=ht"
are accidental, and thus is it possible that it
is doing more harm than good.

In 2.6.34, we warn on invocation of acpi=ht.
In 2.6.35, we delete the boot option.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 Documentation/kernel-parameters.txt |  3 +--
 arch/ia64/include/asm/acpi.h        |  1 -
 arch/x86/include/asm/acpi.h         |  2 --
 arch/x86/kernel/acpi/boot.c         | 19 +++----------------
 arch/x86/lguest/boot.c              |  1 -
 drivers/acpi/tables.c               |  4 ++--
 6 files changed, 6 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 3bc48b0bd3a9..41b924ff0b51 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -143,11 +143,10 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	acpi=		[HW,ACPI,X86]
 			Advanced Configuration and Power Interface
-			Format: { force | off | ht | strict | noirq | rsdt }
+			Format: { force | off | strict | noirq | rsdt }
 			force -- enable ACPI if default was off
 			off -- disable ACPI if default was on
 			noirq -- do not use ACPI for IRQ routing
-			ht -- run only enough ACPI to enable Hyper Threading
 			strict -- Be less tolerant of platforms that are not
 				strictly ACPI specification compliant.
 			rsdt -- prefer RSDT over (default) XSDT
diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h
index 21adbd7f90f8..837dc82a013e 100644
--- a/arch/ia64/include/asm/acpi.h
+++ b/arch/ia64/include/asm/acpi.h
@@ -94,7 +94,6 @@ ia64_acpi_release_global_lock (unsigned int *lock)
 #define acpi_noirq 0	/* ACPI always enabled on IA64 */
 #define acpi_pci_disabled 0 /* ACPI PCI always enabled on IA64 */
 #define acpi_strict 1	/* no ACPI spec workarounds on IA64 */
-#define acpi_ht 0	/* no HT-only mode on IA64 */
 #endif
 #define acpi_processor_cstate_check(x) (x) /* no idle limits on IA64 :) */
 static inline void disable_acpi(void) { }
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 56f462cf22d2..aa2c39d968fc 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -85,7 +85,6 @@ extern int acpi_ioapic;
 extern int acpi_noirq;
 extern int acpi_strict;
 extern int acpi_disabled;
-extern int acpi_ht;
 extern int acpi_pci_disabled;
 extern int acpi_skip_timer_override;
 extern int acpi_use_timer_override;
@@ -97,7 +96,6 @@ void acpi_pic_sci_set_trigger(unsigned int, u16);
 static inline void disable_acpi(void)
 {
 	acpi_disabled = 1;
-	acpi_ht = 0;
 	acpi_pci_disabled = 1;
 	acpi_noirq = 1;
 }
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7914ab0ad76e..63bcf39f8f24 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -62,7 +62,6 @@ EXPORT_SYMBOL(acpi_disabled);
 int acpi_noirq;				/* skip ACPI IRQ initialization */
 int acpi_pci_disabled;		/* skip ACPI PCI scan and IRQ initialization */
 EXPORT_SYMBOL(acpi_pci_disabled);
-int acpi_ht __initdata = 1;	/* enable HT */
 
 int acpi_lapic;
 int acpi_ioapic;
@@ -1460,9 +1459,8 @@ void __init acpi_boot_table_init(void)
 
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return; 
 
 	/*
@@ -1493,9 +1491,8 @@ int __init early_acpi_boot_init(void)
 {
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return 1;
 
 	/*
@@ -1513,9 +1510,8 @@ int __init acpi_boot_init(void)
 
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return 1;
 
 	acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1550,21 +1546,12 @@ static int __init parse_acpi(char *arg)
 	/* acpi=force to over-ride black-list */
 	else if (strcmp(arg, "force") == 0) {
 		acpi_force = 1;
-		acpi_ht = 1;
 		acpi_disabled = 0;
 	}
 	/* acpi=strict disables out-of-spec workarounds */
 	else if (strcmp(arg, "strict") == 0) {
 		acpi_strict = 1;
 	}
-	/* Limit ACPI just to boot-time to enable HT */
-	else if (strcmp(arg, "ht") == 0) {
-		if (!acpi_force) {
-			printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
-			disable_acpi();
-		}
-		acpi_ht = 1;
-	}
 	/* acpi=rsdt use RSDT instead of XSDT */
 	else if (strcmp(arg, "rsdt") == 0) {
 		acpi_rsdt_forced = 1;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 7e59dc1d3fc2..8eb9eed60282 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1391,7 +1391,6 @@ __init void lguest_init(void)
 #endif
 #ifdef CONFIG_ACPI
 	acpi_disabled = 1;
-	acpi_ht = 0;
 #endif
 
 	/*
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index 8a0ed2800e63..f336bca7c450 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -213,7 +213,7 @@ acpi_table_parse_entries(char *id,
 	unsigned long table_end;
 	acpi_size tbl_size;
 
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return -ENODEV;
 
 	if (!handler)
@@ -280,7 +280,7 @@ int __init acpi_table_parse(char *id, acpi_table_handler handler)
 	struct acpi_table_header *table = NULL;
 	acpi_size tbl_size;
 
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return -ENODEV;
 
 	if (!handler)
-- 
cgit v1.2.3-55-g7522


From e4495262826d1eabca3529fa6ac22394eb348132 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Mon, 15 Mar 2010 12:58:22 +0800
Subject: perf, x86: Enable not tagged retired instruction counting on P4s

This should turn on instruction counting on P4s, which was missing in
the first version of the new PMU driver.

It's inaccurate for now, we still need dependant event to tag mops
before we can count them precisely. The result is that the number of
instruction may be lifted up.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1268629102.3355.11.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 8 ++++----
 arch/x86/kernel/cpu/perf_event_p4.c  | 8 +++-----
 2 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 829f4711645f..b47b9e9ac13f 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -324,8 +324,8 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 
 #define P4_SSE_INPUT_ASSIST		P4_EVENT_PACK(0x34, 0x01)
 	/*
-	 * MSR_P4_FIRM_ESCR:	8, 9
-	 * MSR_P4_FIRM_ESCR:	10, 11
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
 #define P4_PACKED_SP_UOP		P4_EVENT_PACK(0x08, 0x01)
@@ -462,8 +462,8 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 
 #define P4_INSTR_RETIRED		P4_EVENT_PACK(0x02, 0x04)
 	/*
-	 * MSR_P4_CRU_ESCR2:	12, 13, 16
-	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 * MSR_P4_CRU_ESCR0:	12, 13, 16
+	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
 #define P4_UOPS_RETIRED			P4_EVENT_PACK(0x01, 0x04)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ef861da1c8ab..a11ce73a93c9 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -60,13 +60,11 @@ struct p4_event_template p4_templates[] = {
 	[2] = {
 		.opcode	= P4_INSTR_RETIRED,
 		.config	= 0,
-		.dep	= 0, /* needs front-end tagging */
+		.dep	= -1, /* needs front-end tagging */
 		.emask	=
 			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG)	|
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSTAG)	|
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG)	|
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSTAG),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
 		.cntr		= { 12, 14 },
 	},
 	[3] = {
-- 
cgit v1.2.3-55-g7522


From 1d199b1ad606ae8b88acebd295b101c4e1cf2a57 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Tue, 16 Mar 2010 01:05:02 +0100
Subject: perf: Fix unexported generic perf_arch_fetch_caller_regs

perf_arch_fetch_caller_regs() is exported for the overriden x86
version, but not for the generic weak version.

As a general rule, weak functions should not have their symbol
exported in the same file they are defined.

So let's export it on trace_event_perf.c as it is used by trace
events only.

This fixes:

	ERROR: ".perf_arch_fetch_caller_regs" [fs/xfs/xfs.ko] undefined!
	ERROR: ".perf_arch_fetch_caller_regs" [arch/powerpc/platforms/cell/spufs/spufs.ko] undefined!

-v2: And also only build it if trace events are enabled.
-v3: Fix changelog mistake

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268697902-9518-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 3 ++-
 kernel/perf_event.c              | 2 ++
 kernel/trace/trace_event_perf.c  | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 978d297170a1..0d3466cf7f57 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1695,6 +1695,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return entry;
 }
 
+#ifdef CONFIG_EVENT_TRACING
 void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
 {
 	regs->ip = ip;
@@ -1706,4 +1707,4 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 	regs->cs = __KERNEL_CS;
 	local_save_flags(regs->flags);
 }
-EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
+#endif
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 8bf61273c58b..455393e71cab 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2790,10 +2790,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return NULL;
 }
 
+#ifdef CONFIG_EVENT_TRACING
 __weak
 void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
 {
 }
+#endif
 
 /*
  * Output
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0709e4f75114..7d79a10c3cde 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -12,6 +12,8 @@
 DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
 EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
 
+EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
+
 static char *perf_trace_buf;
 static char *perf_trace_buf_nmi;
 
-- 
cgit v1.2.3-55-g7522


From 8ea7f544100844307072cae2f5fc108afdef999a Mon Sep 17 00:00:00 2001
From: Lin Ming
Date: Tue, 16 Mar 2010 10:12:36 +0800
Subject: x86, perf: Fix comments in Pentium-4 PMU definitions

Reported-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1268705556.3379.8.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index b47b9e9ac13f..b842b3238e46 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -319,6 +319,7 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 
 #define P4_BSQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x06, 0x07)
 	/*
+	 * NOTE: no ESCR name in docs, it's guessed
 	 * MSR_P4_BSU_ESCR1:	2, 3
 	 */
 
@@ -468,8 +469,8 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 
 #define P4_UOPS_RETIRED			P4_EVENT_PACK(0x01, 0x04)
 	/*
-	 * MSR_P4_CRU_ESCR2:	12, 13, 16
-	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 * MSR_P4_CRU_ESCR0:	12, 13, 16
+	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
 #define P4_UOP_TYPE			P4_EVENT_PACK(0x02, 0x02)
-- 
cgit v1.2.3-55-g7522


From 984763cb90d4b5444baa0c3e43feff7926bf1834 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Tue, 16 Mar 2010 17:07:33 +0100
Subject: perf, x86: Report error code that returned from x86_pmu.hw_config()

If x86_pmu.hw_config() fails a fixed error code (-EOPNOTSUPP) is
returned even if a different error was reported. This patch fixes
this.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Acked-by: Lin Ming <ming.m.lin@intel.com>
Cc: acme@redhat.com
Cc: eranian@google.com
Cc: gorcunov@openvz.org
Cc: peterz@infradead.org
Cc: fweisbec@gmail.com
LKML-Reference: <20100316160733.GR1585@erda.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0d3466cf7f57..5dacf63f913e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -472,8 +472,9 @@ static int __hw_perf_event_init(struct perf_event *event)
 	hwc->last_tag = ~0ULL;
 
 	/* Processor specifics */
-	if (x86_pmu.hw_config(attr, hwc))
-		return -EOPNOTSUPP;
+	err = x86_pmu.hw_config(attr, hwc);
+	if (err)
+		return err;
 
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
-- 
cgit v1.2.3-55-g7522


From b27ea29c6267889be255f2217fa7a6106e6a8b04 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Wed, 17 Mar 2010 12:49:10 +0100
Subject: perf/core, x86: Reduce number of CONFIG_X86_LOCAL_APIC macros

The function reserve_pmc_hardware() and release_pmc_hardware()
were hard to read. This patch improves readability of the code by
removing most of the CONFIG_X86_LOCAL_APIC macros.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268826553-19518-2-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5dacf63f913e..793e63f6c420 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -313,9 +313,10 @@ again:
 static atomic_t active_events;
 static DEFINE_MUTEX(pmc_reserve_mutex);
 
+#ifdef CONFIG_X86_LOCAL_APIC
+
 static bool reserve_pmc_hardware(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	int i;
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -330,11 +331,9 @@ static bool reserve_pmc_hardware(void)
 		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
-#endif
 
 	return true;
 
-#ifdef CONFIG_X86_LOCAL_APIC
 eventsel_fail:
 	for (i--; i >= 0; i--)
 		release_evntsel_nmi(x86_pmu.eventsel + i);
@@ -349,12 +348,10 @@ perfctr_fail:
 		enable_lapic_nmi_watchdog();
 
 	return false;
-#endif
 }
 
 static void release_pmc_hardware(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	int i;
 
 	for (i = 0; i < x86_pmu.num_events; i++) {
@@ -364,9 +361,15 @@ static void release_pmc_hardware(void)
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		enable_lapic_nmi_watchdog();
-#endif
 }
 
+#else
+
+static bool reserve_pmc_hardware(void) { return true; }
+static void release_pmc_hardware(void) {}
+
+#endif
+
 static int reserve_ds_buffers(void);
 static void release_ds_buffers(void);
 
-- 
cgit v1.2.3-55-g7522


From 10f1014d86fd4fe5087080d609b51183396c5e4c Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Wed, 17 Mar 2010 12:49:12 +0100
Subject: perf/core, x86: Remove cpu_hw_events.interrupts

This member in the struct is not used anymore and can be
removed.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268826553-19518-4-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 793e63f6c420..104292a58c2b 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -102,7 +102,6 @@ struct cpu_hw_events {
 	 */
 	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
 	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	unsigned long		interrupts;
 	int			enabled;
 
 	int			n_events;
-- 
cgit v1.2.3-55-g7522


From d6dc0b4ead6e8720096ecfa3d9e899b47ddbc8ed Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Wed, 17 Mar 2010 12:49:13 +0100
Subject: perf/core, x86: Remove duplicate perf_event_mask variable

The same information is stored also in x86_pmu.intel_ctrl. This
patch removes perf_event_mask and instead uses
x86_pmu.intel_ctrl directly.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268826553-19518-5-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 104292a58c2b..c97d5b52d12a 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -75,8 +75,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	return len;
 }
 
-static u64 perf_event_mask __read_mostly;
-
 struct event_constraint {
 	union {
 		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
@@ -1406,7 +1404,7 @@ void __init init_hw_perf_events(void)
 		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
 		x86_pmu.num_events = X86_PMC_MAX_GENERIC;
 	}
-	perf_event_mask = (1 << x86_pmu.num_events) - 1;
+	x86_pmu.intel_ctrl = (1 << x86_pmu.num_events) - 1;
 	perf_max_events = x86_pmu.num_events;
 
 	if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
@@ -1415,9 +1413,8 @@ void __init init_hw_perf_events(void)
 		x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
 	}
 
-	perf_event_mask |=
+	x86_pmu.intel_ctrl |=
 		((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
-	x86_pmu.intel_ctrl = perf_event_mask;
 
 	perf_events_lapic_init();
 	register_die_notifier(&perf_event_nmi_notifier);
@@ -1442,7 +1439,7 @@ void __init init_hw_perf_events(void)
 	pr_info("... value mask:             %016Lx\n", x86_pmu.event_mask);
 	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
 	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);
-	pr_info("... event mask:             %016Lx\n", perf_event_mask);
+	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
 
 	perf_cpu_notifier(x86_pmu_notifier);
 }
-- 
cgit v1.2.3-55-g7522


From 2acebe9ecb2b77876e87a1480729cfb2db4570dd Mon Sep 17 00:00:00 2001
From: Jack Steiner
Date: Wed, 17 Mar 2010 10:40:38 -0500
Subject: x86, UV: Delete unneeded boot messages

SGI:UV: Delete extra boot messages that describe the system
topology. These messages are no longer useful.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20100317154038.GA29346@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 3740c8a4eae7..9a93b07fa141 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -736,9 +736,6 @@ void __init uv_system_init(void)
 		uv_node_to_blade[nid] = blade;
 		uv_cpu_to_blade[cpu] = blade;
 		max_pnode = max(pnode, max_pnode);
-
-		printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
-			cpu, apicid, pnode, nid, lcpu, blade);
 	}
 
 	/* Add blade/pnode info for nodes without cpus */
-- 
cgit v1.2.3-55-g7522


From d674cd1963129b70bc5f631c51fb30fb73213fb2 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Wed, 17 Mar 2010 13:37:00 +0300
Subject: x86, apic: Allow to use certain functions without APIC built-in
 support

In case even if the kernel is configured so that
no APIC support is built-in we still may allow
to use certain apic functions as dummy calls.

In particular we start using it in perf-events code.

Note that this is not that same as NOOP apic driver (which
is used if APIC support is present but no physical APIC is
available), this is for the case when we don't have apic code
compiled in at all.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <20100317104356.011052632@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apic.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index b4ac2cdcb64f..1fa03e04ae44 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -373,6 +373,7 @@ extern atomic_t init_deasserted;
 extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
 #endif
 
+#ifdef CONFIG_X86_LOCAL_APIC
 static inline u32 apic_read(u32 reg)
 {
 	return apic->read(reg);
@@ -403,10 +404,19 @@ static inline u32 safe_apic_wait_icr_idle(void)
 	return apic->safe_wait_icr_idle();
 }
 
+#else /* CONFIG_X86_LOCAL_APIC */
+
+static inline u32 apic_read(u32 reg) { return 0; }
+static inline void apic_write(u32 reg, u32 val) { }
+static inline u64 apic_icr_read(void) { return 0; }
+static inline void apic_icr_write(u32 low, u32 high) { }
+static inline void apic_wait_icr_idle(void) { }
+static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
+
+#endif /* CONFIG_X86_LOCAL_APIC */
 
 static inline void ack_APIC_irq(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * ack_APIC_irq() actually gets compiled as a single instruction
 	 * ... yummie.
@@ -414,7 +424,6 @@ static inline void ack_APIC_irq(void)
 
 	/* Docs say use 0 for future compatibility */
 	apic_write(APIC_EOI, 0);
-#endif
 }
 
 static inline unsigned default_get_apic_id(unsigned long x)
-- 
cgit v1.2.3-55-g7522


From 7335f75e9ca166044e38a96abad422d8e6e364b5 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Wed, 17 Mar 2010 13:37:01 +0300
Subject: x86, perf: Use apic_write unconditionally

Since apic_write() maps to a plain noop in the !CONFIG_X86_LOCAL_APIC
case we're safe to remove this conditional compilation and clean up
the code a bit.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: fweisbec@gmail.com
Cc: acme@redhat.com
Cc: eranian@google.com
Cc: peterz@infradead.org
LKML-Reference: <20100317104356.232371479@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c    | 4 ----
 arch/x86/kernel/cpu/perf_event_p4.c | 2 --
 2 files changed, 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c97d5b52d12a..14eca80918dc 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1136,7 +1136,6 @@ void set_perf_event_pending(void)
 
 void perf_events_lapic_init(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	if (!x86_pmu.apic || !x86_pmu_initialized())
 		return;
 
@@ -1144,7 +1143,6 @@ void perf_events_lapic_init(void)
 	 * Always use NMI for PMU
 	 */
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
 }
 
 static int __kprobes
@@ -1168,9 +1166,7 @@ perf_event_nmi_handler(struct notifier_block *self,
 
 	regs = args->regs;
 
-#ifdef CONFIG_X86_LOCAL_APIC
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
 	/*
 	 * Can't rely on the handled return value to say it was our NMI, two
 	 * events could trigger 'simultaneously' raising two back-to-back NMIs.
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index a11ce73a93c9..0367889b4ae0 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -363,10 +363,8 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 	}
 
 	if (handled) {
-#ifdef CONFIG_X86_LOCAL_APIC
 		/* p4 quirk: unmask it again */
 		apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
-#endif
 		inc_irq_stat(apic_perf_irqs);
 	}
 
-- 
cgit v1.2.3-55-g7522


From f34edbc1cdb0f8f83d94e1d668dd6e41abf0defb Mon Sep 17 00:00:00 2001
From: Lin Ming
Date: Thu, 18 Mar 2010 18:33:07 +0800
Subject: perf, x86: Add a key to simplify template lookup in Pentium-4 PMU

Currently, we use opcode(Event and Event-Selector) + emask to
look up template in p4_templates.

But cache events (L1-dcache-load-misses, LLC-load-misses, etc)
use the same event(P4_REPLAY_EVENT) to do the counting, ie, they
have the same opcode and emask. So we can not use current lookup
mechanism to find the template for cache events.

This patch introduces a "key", which is the index into
p4_templates. The low 12 bits of CCCR are reserved, so we can
hide the "key" in the low 12 bits of hwc->config.

We extract the key from hwc->config and then quickly find the
template.

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1268908387.13901.127.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h |  5 ++-
 arch/x86/kernel/cpu/perf_event_p4.c  | 86 ++++++++++++++----------------------
 2 files changed, 38 insertions(+), 53 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index b842b3238e46..7d3406a2773c 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -65,6 +65,7 @@
 #define P4_CCCR_THREAD_SINGLE		0x00010000U
 #define P4_CCCR_THREAD_BOTH		0x00020000U
 #define P4_CCCR_THREAD_ANY		0x00030000U
+#define P4_CCCR_RESERVED		0x00000fffU
 
 /* Non HT mask */
 #define P4_CCCR_MASK				\
@@ -116,7 +117,7 @@
 #define p4_config_pack_escr(v)		(((u64)(v)) << 32)
 #define p4_config_pack_cccr(v)		(((u64)(v)) & 0xffffffffULL)
 #define p4_config_unpack_escr(v)	(((u64)(v)) >> 32)
-#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xffffffffULL)
+#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xfffff000ULL)
 
 #define p4_config_unpack_emask(v)			\
 	({						\
@@ -126,6 +127,8 @@
 		t;					\
 	})
 
+#define p4_config_unpack_key(v)		(((u64)(v)) & P4_CCCR_RESERVED)
+
 #define P4_CONFIG_HT_SHIFT		63
 #define P4_CONFIG_HT			(1ULL << P4_CONFIG_HT_SHIFT)
 
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 0367889b4ae0..3e97ed3904cc 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -18,6 +18,7 @@ struct p4_event_template {
 	u32 opcode;			/* ESCR event + CCCR selector */
 	u64 config;			/* packed predefined bits */
 	int dep;			/* upstream dependency event index */
+	int key;			/* index into p4_templates */
 	unsigned int emask;		/* ESCR EventMask */
 	unsigned int escr_msr[2];	/* ESCR MSR for this event */
 	unsigned int cntr[2];		/* counter index (offset) */
@@ -39,38 +40,31 @@ static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
  */
 struct p4_event_template p4_templates[] = {
 	[0] = {
-		.opcode	= P4_UOP_TYPE,
-		.config	= 0,
-		.dep	= -1,
-		.emask	=
-			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
-			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
-		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
-		.cntr		= { 16, 17 },
-	},
-	[1] = {
 		.opcode	= P4_GLOBAL_POWER_EVENTS,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 0,
 		.emask	=
 			P4_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING),
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
 		.cntr		= { 0, 2 },
 	},
-	[2] = {
+	[1] = {
 		.opcode	= P4_INSTR_RETIRED,
 		.config	= 0,
 		.dep	= -1, /* needs front-end tagging */
+		.key	= 1,
 		.emask	=
 			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG)	|
 			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG),
 		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
 		.cntr		= { 12, 14 },
 	},
-	[3] = {
+	[2] = {
 		.opcode	= P4_BSQ_CACHE_REFERENCE,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 2,
 		.emask	=
 			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)	|
 			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)	|
@@ -81,10 +75,11 @@ struct p4_event_template p4_templates[] = {
 		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
 		.cntr		= { 0, 2 },
 	},
-	[4] = {
+	[3] = {
 		.opcode	= P4_BSQ_CACHE_REFERENCE,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 3,
 		.emask	=
 			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)	|
 			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)	|
@@ -92,10 +87,11 @@ struct p4_event_template p4_templates[] = {
 		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
 		.cntr		= { 0, 3 },
 	},
-	[5] = {
+	[4] = {
 		.opcode	= P4_RETIRED_BRANCH_TYPE,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 4,
 		.emask	=
 			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL)	|
 			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL)		|
@@ -104,48 +100,38 @@ struct p4_event_template p4_templates[] = {
 		.escr_msr	= { MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1 },
 		.cntr		= { 4, 6 },
 	},
-	[6] = {
+	[5] = {
 		.opcode	= P4_MISPRED_BRANCH_RETIRED,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 5,
 		.emask	=
 			P4_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS),
 		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
 		.cntr		= { 12, 14 },
 	},
-	[7] = {
+	[6] = {
 		.opcode	= P4_FSB_DATA_ACTIVITY,
 		.config	= p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
 		.dep	= -1,
+		.key	= 6,
 		.emask	=
 			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV)	|
 			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN),
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
 		.cntr		= { 0, 2 },
 	},
-};
-
-static struct p4_event_template *p4_event_map[PERF_COUNT_HW_MAX] = {
-	/* non-halted CPU clocks */
-	[PERF_COUNT_HW_CPU_CYCLES]		= &p4_templates[1],
-
-	/* retired instructions: dep on tagging the FSB */
-	[PERF_COUNT_HW_INSTRUCTIONS]		= &p4_templates[2],
-
-	/* cache hits */
-	[PERF_COUNT_HW_CACHE_REFERENCES]	= &p4_templates[3],
-
-	/* cache misses */
-	[PERF_COUNT_HW_CACHE_MISSES]		= &p4_templates[4],
-
-	/* branch instructions retired */
-	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= &p4_templates[5],
-
-	/* mispredicted branches retired */
-	[PERF_COUNT_HW_BRANCH_MISSES]		= &p4_templates[6],
-
-	/* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
-	[PERF_COUNT_HW_BUS_CYCLES]		= &p4_templates[7],
+	[7] = {
+		.opcode	= P4_UOP_TYPE,
+		.config	= 0,
+		.dep	= -1,
+		.key	= 7,
+		.emask	=
+			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
+			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
+		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+		.cntr		= { 16, 17 },
+	},
 };
 
 static u64 p4_pmu_event_map(int hw_event)
@@ -153,11 +139,11 @@ static u64 p4_pmu_event_map(int hw_event)
 	struct p4_event_template *tpl;
 	u64 config;
 
-	if (hw_event > ARRAY_SIZE(p4_event_map)) {
+	if (hw_event > ARRAY_SIZE(p4_templates)) {
 		printk_once(KERN_ERR "PMU: Incorrect event index\n");
 		return 0;
 	}
-	tpl = p4_event_map[hw_event];
+	tpl = &p4_templates[hw_event];
 
 	/*
 	 * fill config up according to
@@ -167,6 +153,7 @@ static u64 p4_pmu_event_map(int hw_event)
 	config |= p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(tpl->opcode) << P4_EVNTSEL_EVENT_SHIFT);
 	config |= p4_config_pack_escr(tpl->emask << P4_EVNTSEL_EVENTMASK_SHIFT);
 	config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
+	config |= p4_config_pack_cccr(hw_event & P4_CCCR_RESERVED);
 
 	/* on HT machine we need a special bit */
 	if (p4_ht_active() && p4_ht_thread(raw_smp_processor_id()))
@@ -187,17 +174,12 @@ static inline int p4_pmu_emask_match(unsigned int dst, unsigned int src)
 
 static struct p4_event_template *p4_pmu_template_lookup(u64 config)
 {
-	u32 opcode = p4_config_unpack_opcode(config);
-	unsigned int emask = p4_config_unpack_emask(config);
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(p4_templates); i++) {
-		if (opcode == p4_templates[i].opcode &&
-			p4_pmu_emask_match(emask, p4_templates[i].emask))
-			return &p4_templates[i];
-	}
+	int key = p4_config_unpack_key(config);
 
-	return NULL;
+	if (key < ARRAY_SIZE(p4_templates))
+		return &p4_templates[key];
+	else
+		return NULL;
 }
 
 /*
@@ -564,7 +546,7 @@ static __initconst struct x86_pmu p4_pmu = {
 	.perfctr		= MSR_P4_BPU_PERFCTR0,
 	.event_map		= p4_pmu_event_map,
 	.raw_event		= p4_pmu_raw_event,
-	.max_events		= ARRAY_SIZE(p4_event_map),
+	.max_events		= ARRAY_SIZE(p4_templates),
 	.get_event_constraints	= x86_get_event_constraints,
 	/*
 	 * IF HT disabled we may need to use all
-- 
cgit v1.2.3-55-g7522


From cb7d6b5053e86598735d9af19930f5929f007b7f Mon Sep 17 00:00:00 2001
From: Lin Ming
Date: Thu, 18 Mar 2010 18:33:12 +0800
Subject: perf, x86: Add cache events for the Pentium-4 PMU

Move the HT bit setting code from p4_pmu_event_map to
p4_hw_config. So the cache events can get HT bit set correctly.

Tested on my P4 desktop, below 6 cache events work:

 L1-dcache-load-misses
 LLC-load-misses
 dTLB-load-misses
 dTLB-store-misses
 iTLB-loads
 iTLB-load-misses

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1268908392.13901.128.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/msr-index.h     |   2 +
 arch/x86/include/asm/perf_event_p4.h |  10 +++
 arch/x86/kernel/cpu/perf_event_p4.c  | 153 +++++++++++++++++++++++++++++++++--
 3 files changed, 159 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1cd58cdbc03f..aef562c0a647 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -357,6 +357,8 @@
 #define MSR_P4_U2L_ESCR0		0x000003b0
 #define MSR_P4_U2L_ESCR1		0x000003b1
 
+#define MSR_P4_PEBS_MATRIX_VERT		0x000003f2
+
 /* Intel Core-based CPU performance counters */
 #define MSR_CORE_PERF_FIXED_CTR0	0x00000309
 #define MSR_CORE_PERF_FIXED_CTR1	0x0000030a
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 7d3406a2773c..871249cf4d2b 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -708,4 +708,14 @@ enum P4_EVENTS_ATTR {
 	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, BOGUS, 1),
 };
 
+enum {
+	KEY_P4_L1D_OP_READ_RESULT_MISS,
+	KEY_P4_LL_OP_READ_RESULT_MISS,
+	KEY_P4_DTLB_OP_READ_RESULT_MISS,
+	KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+	KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+	KEY_P4_ITLB_OP_READ_RESULT_MISS,
+	KEY_P4_UOP_TYPE,
+};
+
 #endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 3e97ed3904cc..b7bf9911198c 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -19,6 +19,11 @@ struct p4_event_template {
 	u64 config;			/* packed predefined bits */
 	int dep;			/* upstream dependency event index */
 	int key;			/* index into p4_templates */
+	u64 msr;			/*
+					 * the high 32 bits set into MSR_IA32_PEBS_ENABLE and
+					 * the low 32 bits set into MSR_P4_PEBS_MATRIX_VERT
+					 * for cache events
+					 */
 	unsigned int emask;		/* ESCR EventMask */
 	unsigned int escr_msr[2];	/* ESCR MSR for this event */
 	unsigned int cntr[2];		/* counter index (offset) */
@@ -31,6 +36,67 @@ struct p4_pmu_res {
 
 static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
 
+#define P4_CACHE_EVENT_CONFIG(event, bit) \
+	p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(event) << P4_EVNTSEL_EVENT_SHIFT) | \
+	p4_config_pack_escr((event##_##bit) << P4_EVNTSEL_EVENTMASK_SHIFT) | \
+	p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(event) << P4_CCCR_ESCR_SELECT_SHIFT)
+
+static __initconst u64 p4_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+					/* 1stL_cache_load_miss_retired */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+					| KEY_P4_L1D_OP_READ_RESULT_MISS,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+					/* 2ndL_cache_load_miss_retired */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+					| KEY_P4_LL_OP_READ_RESULT_MISS,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+					/* DTLB_load_miss_retired */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+					| KEY_P4_DTLB_OP_READ_RESULT_MISS,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+					/* DTLB_store_miss_retired */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+					| KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+					/* ITLB_reference.HIT */
+		[ C(RESULT_ACCESS) ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, HIT)
+					| KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+
+					/* ITLB_reference.MISS */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, MISS)
+					| KEY_P4_ITLB_OP_READ_RESULT_MISS,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
 /*
  * WARN: CCCR1 doesn't have a working enable bit so try to not
  * use it if possible
@@ -121,11 +187,77 @@ struct p4_event_template p4_templates[] = {
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
 		.cntr		= { 0, 2 },
 	},
-	[7] = {
+	[KEY_P4_L1D_OP_READ_RESULT_MISS] = {
+		.opcode	= P4_REPLAY_EVENT,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= (u64)(1 << 0 | 1 << 24) << 32 | (1 << 0),
+		.key	= KEY_P4_L1D_OP_READ_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+		.cntr		= { 16, 17 },
+	},
+	[KEY_P4_LL_OP_READ_RESULT_MISS] = {
+		.opcode	= P4_REPLAY_EVENT,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= (u64)(1 << 1 | 1 << 24) << 32 | (1 << 0),
+		.key	= KEY_P4_LL_OP_READ_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+		.cntr		= { 16, 17 },
+	},
+	[KEY_P4_DTLB_OP_READ_RESULT_MISS] = {
+		.opcode	= P4_REPLAY_EVENT,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= (u64)(1 << 2 | 1 << 24) << 32 | (1 << 0),
+		.key	= KEY_P4_DTLB_OP_READ_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+		.cntr		= { 16, 17 },
+	},
+	[KEY_P4_DTLB_OP_WRITE_RESULT_MISS] = {
+		.opcode	= P4_REPLAY_EVENT,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= (u64)(1 << 2 | 1 << 24) << 32 | (1 << 1),
+		.key	= KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+		.cntr		= { 16, 17 },
+	},
+	[KEY_P4_ITLB_OP_READ_RESULT_ACCESS] = {
+		.opcode	= P4_ITLB_REFERENCE,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= 0,
+		.key	= KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+		.emask	=
+			P4_EVENT_ATTR(P4_ITLB_REFERENCE, HIT),
+		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+	[KEY_P4_ITLB_OP_READ_RESULT_MISS] = {
+		.opcode	= P4_ITLB_REFERENCE,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= 0,
+		.key	= KEY_P4_ITLB_OP_READ_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_ITLB_REFERENCE, MISS),
+		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+	[KEY_P4_UOP_TYPE] = {
 		.opcode	= P4_UOP_TYPE,
 		.config	= 0,
 		.dep	= -1,
-		.key	= 7,
+		.key	= KEY_P4_UOP_TYPE,
 		.emask	=
 			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
 			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
@@ -155,10 +287,6 @@ static u64 p4_pmu_event_map(int hw_event)
 	config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
 	config |= p4_config_pack_cccr(hw_event & P4_CCCR_RESERVED);
 
-	/* on HT machine we need a special bit */
-	if (p4_ht_active() && p4_ht_thread(raw_smp_processor_id()))
-		config = p4_set_ht_bit(config);
-
 	return config;
 }
 
@@ -211,6 +339,10 @@ static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
 	/* Count user and OS events unless not requested to */
 	hwc->config |= p4_config_pack_escr(p4_default_escr_conf(cpu, attr->exclude_kernel,
 								attr->exclude_user));
+	/* on HT machine we need a special bit */
+	if (p4_ht_active() && p4_ht_thread(cpu))
+		hwc->config = p4_set_ht_bit(hwc->config);
+
 	return 0;
 }
 
@@ -271,6 +403,12 @@ static void p4_pmu_enable_event(struct perf_event *event)
 		pr_crit("%s: Wrong index: %d\n", __func__, hwc->idx);
 		return;
 	}
+
+	if (tpl->msr) {
+		(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, tpl->msr >> 32);
+		(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, tpl->msr & 0xffffffff);
+	}
+
 	escr_base = (u64)tpl->escr_msr[thread];
 
 	/*
@@ -577,6 +715,9 @@ static __init int p4_pmu_init(void)
 		return -ENODEV;
 	}
 
+	memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
+	       sizeof(hw_cache_event_ids));
+
 	pr_cont("Netburst events, ");
 
 	x86_pmu = p4_pmu;
-- 
cgit v1.2.3-55-g7522


From 4b24a88b35e15e04bd8f2c5dda65b5dc8ebca05f Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Wed, 17 Mar 2010 23:21:01 +0200
Subject: perf_events: Fix resource leak in x86 __hw_perf_event_init()

If reserve_pmc_hardware() succeeds but reserve_ds_buffers()
fails, then we need to release_pmc_hardware. It won't be done
by the destroy() callback because we return before setting it
in case of error.

Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Cc: peterz@infradead.org
Cc: paulus@samba.org
Cc: davem@davemloft.net
Cc: fweisbec@gmail.com
Cc: robert.richter@amd.com
Cc: perfmon2-devel@lists.sf.net
LKML-Reference: <4ba1568b.15185e0a.182a.7802@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--
 arch/x86/kernel/cpu/perf_event.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
---
 arch/x86/kernel/cpu/perf_event.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 14eca80918dc..f571f514de2a 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -455,8 +455,11 @@ static int __hw_perf_event_init(struct perf_event *event)
 		if (atomic_read(&active_events) == 0) {
 			if (!reserve_pmc_hardware())
 				err = -EBUSY;
-			else
+			else {
 				err = reserve_ds_buffers();
+				if (err)
+					release_pmc_hardware();
+			}
 		}
 		if (!err)
 			atomic_inc(&active_events);
-- 
cgit v1.2.3-55-g7522


From 9c8c6bad3137112d2c7bf3d215b736ee4215fa74 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Fri, 19 Mar 2010 00:12:56 +0300
Subject: x86, perf: Fix few cosmetic dabs for P4 pmu (comments and
 constantify)

- A few ESCR have escaped fixing at previous attempt.
- p4_escr_map is read only, make it const.

Nothing serious.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100318211256.GH5062@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 4 ++--
 arch/x86/kernel/cpu/perf_event_p4.c  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 871249cf4d2b..2a1a57f71539 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -401,13 +401,13 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 #define P4_RETIRED_MISPRED_BRANCH_TYPE	P4_EVENT_PACK(0x05, 0x02)
 	/*
 	 * MSR_P4_TBPU_ESCR0:	4, 5
-	 * MSR_P4_TBPU_ESCR0:	6, 7
+	 * MSR_P4_TBPU_ESCR1:	6, 7
 	 */
 
 #define P4_RETIRED_BRANCH_TYPE		P4_EVENT_PACK(0x04, 0x02)
 	/*
 	 * MSR_P4_TBPU_ESCR0:	4, 5
-	 * MSR_P4_TBPU_ESCR0:	6, 7
+	 * MSR_P4_TBPU_ESCR1:	6, 7
 	 */
 
 #define P4_RESOURCE_STALL		P4_EVENT_PACK(0x01, 0x01)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index b7bf9911198c..b8a811ab7609 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -545,7 +545,7 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 }
 
 /* ESCRs are not sequential in memory so we need a map */
-static unsigned int p4_escr_map[ARCH_P4_TOTAL_ESCR] = {
+static const unsigned int p4_escr_map[ARCH_P4_TOTAL_ESCR] = {
 	MSR_P4_ALF_ESCR0,	/*  0 */
 	MSR_P4_ALF_ESCR1,	/*  1 */
 	MSR_P4_BPU_ESCR0,	/*  2 */
-- 
cgit v1.2.3-55-g7522


From 40b7e05e17eef31ff30fe08dfc2424ef653a792c Mon Sep 17 00:00:00 2001
From: Lin Ming
Date: Fri, 19 Mar 2010 15:28:58 +0800
Subject: perf, x86: Fix key indexing in Pentium-4 PMU

Index 0-6 in p4_templates are reserved for common hardware
events. So p4_templates is arranged as below:

    0  -    6:  common hardware events
    7  -    N:  cache events
  N+1  -  ...:  other raw events

Reported-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1268983738.13901.142.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 2a1a57f71539..facf96186b26 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -709,7 +709,7 @@ enum P4_EVENTS_ATTR {
 };
 
 enum {
-	KEY_P4_L1D_OP_READ_RESULT_MISS,
+	KEY_P4_L1D_OP_READ_RESULT_MISS = PERF_COUNT_HW_MAX,
 	KEY_P4_LL_OP_READ_RESULT_MISS,
 	KEY_P4_DTLB_OP_READ_RESULT_MISS,
 	KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
-- 
cgit v1.2.3-55-g7522


From 4bd96a7a8185755b091233b16034c7436cbf57af Mon Sep 17 00:00:00 2001
From: Shane Wang
Date: Wed, 10 Mar 2010 14:36:10 +0800
Subject: x86, tboot: Add support for S3 memory integrity protection

This patch adds support for S3 memory integrity protection within an Intel(R)
TXT launched kernel, for all kernel and userspace memory.  All RAM used by the
kernel and userspace, as indicated by memory ranges of type E820_RAM and
E820_RESERVED_KERN in the e820 table, will be integrity protected.

The MAINTAINERS file is also updated to reflect the maintainers of the
TXT-related code.

All MACing is done in tboot, based on a complexity analysis and tradeoff.

v3: Compared with v2, this patch adds a check of array size in
tboot.c, and a note to specify which c/s of tboot supports this kind
of MACing in intel_txt.txt.

Signed-off-by: Shane Wang <shane.wang@intel.com>
LKML-Reference: <4B973DDA.6050902@intel.com>
Signed-off-by: Joseph Cihula <joseph.cihula@intel.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/intel_txt.txt | 16 +++++++++-------
 MAINTAINERS                 | 11 +++++++++++
 arch/x86/include/asm/e820.h |  7 ++++++-
 arch/x86/kernel/tboot.c     | 20 +++++++++++---------
 4 files changed, 37 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt
index f40a1f030019..87c8990dbbd9 100644
--- a/Documentation/intel_txt.txt
+++ b/Documentation/intel_txt.txt
@@ -161,13 +161,15 @@ o  In order to put a system into any of the sleep states after a TXT
       has been restored, it will restore the TPM PCRs and then
       transfer control back to the kernel's S3 resume vector.
       In order to preserve system integrity across S3, the kernel
-      provides tboot with a set of memory ranges (kernel
-      code/data/bss, S3 resume code, and AP trampoline) that tboot
-      will calculate a MAC (message authentication code) over and then
-      seal with the TPM.  On resume and once the measured environment
-      has been re-established, tboot will re-calculate the MAC and
-      verify it against the sealed value.  Tboot's policy determines
-      what happens if the verification fails.
+      provides tboot with a set of memory ranges (RAM and RESERVED_KERN
+      in the e820 table, but not any memory that BIOS might alter over
+      the S3 transition) that tboot will calculate a MAC (message
+      authentication code) over and then seal with the TPM. On resume
+      and once the measured environment has been re-established, tboot
+      will re-calculate the MAC and verify it against the sealed value.
+      Tboot's policy determines what happens if the verification fails.
+      Note that the c/s 194 of tboot which has the new MAC code supports
+      this.
 
 That's pretty much it for TXT support.
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 47cc449d89d8..d3072cb8805d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2940,6 +2940,17 @@ S:	Odd Fixes
 F:	Documentation/networking/README.ipw2200
 F:	drivers/net/wireless/ipw2x00/ipw2200.*
 
+INTEL(R) TRUSTED EXECUTION TECHNOLOGY (TXT)
+M:	Joseph Cihula <joseph.cihula@intel.com>
+M:	Shane Wang <shane.wang@intel.com>
+L:	tboot-devel@lists.sourceforge.net
+W:	http://tboot.sourceforge.net
+T:	Mercurial http://www.bughost.org/repos.hg/tboot.hg
+S:	Supported
+F:	Documentation/intel_txt.txt
+F:	include/linux/tboot.h
+F:	arch/x86/kernel/tboot.c
+
 INTEL WIRELESS WIMAX CONNECTION 2400
 M:	Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
 M:	linux-wimax@intel.com
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 0e22296790d3..ec8a52d14ab1 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -45,7 +45,12 @@
 #define E820_NVS	4
 #define E820_UNUSABLE	5
 
-/* reserved RAM used by kernel itself */
+/*
+ * reserved RAM used by kernel itself
+ * if CONFIG_INTEL_TXT is enabled, memory of this type will be
+ * included in the S3 integrity calculation and so should not include
+ * any memory that BIOS might alter over the S3 transition
+ */
 #define E820_RESERVED_KERN        128
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 86c9f91b48ae..cc2c60474fd0 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -175,6 +175,9 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
 	struct tboot_mac_region *mr;
 	phys_addr_t end = start + size;
 
+	if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS)
+		panic("tboot: Too many MAC regions\n");
+
 	if (start && size) {
 		mr = &tboot->mac_regions[tboot->num_mac_regions++];
 		mr->start = round_down(start, PAGE_SIZE);
@@ -184,18 +187,17 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
 
 static int tboot_setup_sleep(void)
 {
+	int i;
+
 	tboot->num_mac_regions = 0;
 
-	/* S3 resume code */
-	add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
+	for (i = 0; i < e820.nr_map; i++) {
+		if ((e820.map[i].type != E820_RAM)
+		 && (e820.map[i].type != E820_RESERVED_KERN))
+			continue;
 
-#ifdef CONFIG_X86_TRAMPOLINE
-	/* AP trampoline code */
-	add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
-#endif
-
-	/* kernel code + data + bss */
-	add_mac_region(virt_to_phys(_text), _end - _text);
+		add_mac_region(e820.map[i].addr, e820.map[i].size);
+	}
 
 	tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
 
-- 
cgit v1.2.3-55-g7522


From 62e7bec49479e0c61e8cfd914f722a9ca6fd52e5 Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Wed, 24 Mar 2010 21:37:57 +0800
Subject: crypto: aesni-intel - Fix another CTR build failure with gas 2.16.1

The previous AES-NI CTR optimization compiling failure gas 2.16.1 fix
introduces another compiling failure by itself. This patch fixes that.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/include/asm/inst.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index 840a399701b2..280bf7fb6aba 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -120,9 +120,9 @@
 	.macro REG_TYPE type reg
 	R64_NUM reg_type_r64 \reg
 	XMM_NUM reg_type_xmm \reg
-	.if reg_type_r64 != REG_NUM_INVALID
+	.if reg_type_r64 <> REG_NUM_INVALID
 	\type = REG_TYPE_R64
-	.elseif reg_type_xmm != REG_NUM_INVALID
+	.elseif reg_type_xmm <> REG_NUM_INVALID
 	\type = REG_TYPE_XMM
 	.else
 	\type = REG_TYPE_INVALID
-- 
cgit v1.2.3-55-g7522


From d814f30105798b6677ecb73ed61d691ff96dada9 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Wed, 24 Mar 2010 12:09:26 +0800
Subject: x86, perf: Add raw events support for the P4 PMU

The adding of raw event support lead to complete code
refactoring. I hope is became more readable then it was.

The list of changes:

1)  The 64bit config field is enough to hold all information we need
    to track event details. To achieve it we used *own* enum for
    events selection in ESCR register and map this key into proper
    value at moment of event enabling.

    For the same reason we use 12LSB bits in CCCR register -- to track
    which exactly cache trace event was requested. And we cear this bits
    at real 'write' moment.

2)  There is no per-cpu area reserved for P4 PMU anymore. We
    don't need it. All is held by config.

3)  Now we may use any available counter, ie we try to grab any
    possible counter.

v2:
  - Lin Ming reported the lack of ESCR selector in CCCR for cache events

v3:
  - Don't loose cache event codes at config unpacking procedure, we may
    need it one day so no obscure hack behind our back, better to clear
    reserved bits explicitly when needed (thanks Ming for pointing out)

  - Lin Ming fixed misplaced opcodes in cache events

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Tested-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1269403766.3409.6.camel@minggr.sh.intel.com>
[ v4: did a few whitespace fixlets ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 691 +++++++++++++++++---------------
 arch/x86/kernel/cpu/perf_event_p4.c  | 746 ++++++++++++++++++++---------------
 2 files changed, 806 insertions(+), 631 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index facf96186b26..b05400a542ff 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -15,38 +15,40 @@
  * perf-MSRs are not shared and every thread has its
  * own perf-MSRs set)
  */
-#define ARCH_P4_TOTAL_ESCR		(46)
-#define ARCH_P4_RESERVED_ESCR		(2) /* IQ_ESCR(0,1) not always present */
-#define ARCH_P4_MAX_ESCR		(ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
-#define ARCH_P4_MAX_CCCR		(18)
-#define ARCH_P4_MAX_COUNTER		(ARCH_P4_MAX_CCCR / 2)
-
-#define P4_EVNTSEL_EVENT_MASK		0x7e000000U
-#define P4_EVNTSEL_EVENT_SHIFT		25
-#define P4_EVNTSEL_EVENTMASK_MASK	0x01fffe00U
-#define P4_EVNTSEL_EVENTMASK_SHIFT	9
-#define P4_EVNTSEL_TAG_MASK		0x000001e0U
-#define P4_EVNTSEL_TAG_SHIFT		5
-#define P4_EVNTSEL_TAG_ENABLE		0x00000010U
-#define P4_EVNTSEL_T0_OS		0x00000008U
-#define P4_EVNTSEL_T0_USR		0x00000004U
-#define P4_EVNTSEL_T1_OS		0x00000002U
-#define P4_EVNTSEL_T1_USR		0x00000001U
+#define ARCH_P4_TOTAL_ESCR	(46)
+#define ARCH_P4_RESERVED_ESCR	(2) /* IQ_ESCR(0,1) not always present */
+#define ARCH_P4_MAX_ESCR	(ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
+#define ARCH_P4_MAX_CCCR	(18)
+#define ARCH_P4_MAX_COUNTER	(ARCH_P4_MAX_CCCR / 2)
+
+#define P4_ESCR_EVENT_MASK	0x7e000000U
+#define P4_ESCR_EVENT_SHIFT	25
+#define P4_ESCR_EVENTMASK_MASK	0x01fffe00U
+#define P4_ESCR_EVENTMASK_SHIFT	9
+#define P4_ESCR_TAG_MASK	0x000001e0U
+#define P4_ESCR_TAG_SHIFT	5
+#define P4_ESCR_TAG_ENABLE	0x00000010U
+#define P4_ESCR_T0_OS		0x00000008U
+#define P4_ESCR_T0_USR		0x00000004U
+#define P4_ESCR_T1_OS		0x00000002U
+#define P4_ESCR_T1_USR		0x00000001U
+
+#define P4_ESCR_EVENT(v)	((v) << P4_ESCR_EVENT_SHIFT)
+#define P4_ESCR_EMASK(v)	((v) << P4_ESCR_EVENTMASK_SHIFT)
+#define P4_ESCR_TAG(v)		((v) << P4_ESCR_TAG_SHIFT)
 
 /* Non HT mask */
-#define P4_EVNTSEL_MASK				\
-	(P4_EVNTSEL_EVENT_MASK		|	\
-	P4_EVNTSEL_EVENTMASK_MASK	|	\
-	P4_EVNTSEL_TAG_MASK		|	\
-	P4_EVNTSEL_TAG_ENABLE		|	\
-	P4_EVNTSEL_T0_OS		|	\
-	P4_EVNTSEL_T0_USR)
+#define P4_ESCR_MASK			\
+	(P4_ESCR_EVENT_MASK	|	\
+	P4_ESCR_EVENTMASK_MASK	|	\
+	P4_ESCR_TAG_MASK	|	\
+	P4_ESCR_TAG_ENABLE	|	\
+	P4_ESCR_T0_OS		|	\
+	P4_ESCR_T0_USR)
 
 /* HT mask */
-#define P4_EVNTSEL_MASK_HT			\
-	(P4_EVNTSEL_MASK		|	\
-	P4_EVNTSEL_T1_OS		|	\
-	P4_EVNTSEL_T1_USR)
+#define P4_ESCR_MASK_HT			\
+	(P4_ESCR_MASK |	P4_ESCR_T1_OS | P4_ESCR_T1_USR)
 
 #define P4_CCCR_OVF			0x80000000U
 #define P4_CCCR_CASCADE			0x40000000U
@@ -56,7 +58,6 @@
 #define P4_CCCR_EDGE			0x01000000U
 #define P4_CCCR_THRESHOLD_MASK		0x00f00000U
 #define P4_CCCR_THRESHOLD_SHIFT		20
-#define P4_CCCR_THRESHOLD(v)		((v) << P4_CCCR_THRESHOLD_SHIFT)
 #define P4_CCCR_COMPLEMENT		0x00080000U
 #define P4_CCCR_COMPARE			0x00040000U
 #define P4_CCCR_ESCR_SELECT_MASK	0x0000e000U
@@ -67,6 +68,13 @@
 #define P4_CCCR_THREAD_ANY		0x00030000U
 #define P4_CCCR_RESERVED		0x00000fffU
 
+#define P4_CCCR_THRESHOLD(v)		((v) << P4_CCCR_THRESHOLD_SHIFT)
+#define P4_CCCR_ESEL(v)			((v) << P4_CCCR_ESCR_SELECT_SHIFT)
+
+/* Custom bits in reerved CCCR area */
+#define P4_CCCR_CACHE_OPS_MASK		0x0000003fU
+
+
 /* Non HT mask */
 #define P4_CCCR_MASK				\
 	(P4_CCCR_OVF			|	\
@@ -81,25 +89,11 @@
 	P4_CCCR_ENABLE)
 
 /* HT mask */
-#define P4_CCCR_MASK_HT				\
-	(P4_CCCR_MASK			|	\
-	P4_CCCR_THREAD_ANY)
+#define P4_CCCR_MASK_HT	(P4_CCCR_MASK | P4_CCCR_THREAD_ANY)
 
-/*
- * format is 32 bit: ee ss aa aa
- * where
- *	ee - 8 bit event
- *	ss - 8 bit selector
- *	aa aa - 16 bits reserved for tags/attributes
- */
-#define P4_EVENT_PACK(event, selector)		(((event) << 24) | ((selector) << 16))
-#define P4_EVENT_UNPACK_EVENT(packed)		(((packed) >> 24) & 0xff)
-#define P4_EVENT_UNPACK_SELECTOR(packed)	(((packed) >> 16) & 0xff)
-#define P4_EVENT_PACK_ATTR(attr)		((attr))
-#define P4_EVENT_UNPACK_ATTR(packed)		((packed) & 0xffff)
-#define P4_MAKE_EVENT_ATTR(class, name, bit)	class##_##name = (1 << bit)
-#define P4_EVENT_ATTR(class, name)		class##_##name
-#define P4_EVENT_ATTR_STR(class, name)		__stringify(class##_##name)
+#define P4_GEN_ESCR_EMASK(class, name, bit)	\
+	class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
+#define P4_ESCR_EMASK_BIT(class, name)		class##__##name
 
 /*
  * config field is 64bit width and consists of
@@ -117,35 +111,29 @@
 #define p4_config_pack_escr(v)		(((u64)(v)) << 32)
 #define p4_config_pack_cccr(v)		(((u64)(v)) & 0xffffffffULL)
 #define p4_config_unpack_escr(v)	(((u64)(v)) >> 32)
-#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xfffff000ULL)
+#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xffffffffULL)
 
 #define p4_config_unpack_emask(v)			\
 	({						\
 		u32 t = p4_config_unpack_escr((v));	\
-		t  &= P4_EVNTSEL_EVENTMASK_MASK;	\
-		t >>= P4_EVNTSEL_EVENTMASK_SHIFT;	\
+		t = t &  P4_ESCR_EVENTMASK_MASK;	\
+		t = t >> P4_ESCR_EVENTMASK_SHIFT;	\
+		t;					\
+	})
+
+#define p4_config_unpack_event(v)			\
+	({						\
+		u32 t = p4_config_unpack_escr((v));	\
+		t = t &  P4_ESCR_EVENT_MASK;		\
+		t = t >> P4_ESCR_EVENT_SHIFT;		\
 		t;					\
 	})
 
-#define p4_config_unpack_key(v)		(((u64)(v)) & P4_CCCR_RESERVED)
+#define p4_config_unpack_cache_event(v)	(((u64)(v)) & P4_CCCR_CACHE_OPS_MASK)
 
 #define P4_CONFIG_HT_SHIFT		63
 #define P4_CONFIG_HT			(1ULL << P4_CONFIG_HT_SHIFT)
 
-static inline u32 p4_config_unpack_opcode(u64 config)
-{
-	u32 e, s;
-
-	/*
-	 * we don't care about HT presence here since
-	 * event opcode doesn't depend on it
-	 */
-	e = (p4_config_unpack_escr(config) & P4_EVNTSEL_EVENT_MASK) >> P4_EVNTSEL_EVENT_SHIFT;
-	s = (p4_config_unpack_cccr(config) & P4_CCCR_ESCR_SELECT_MASK) >> P4_CCCR_ESCR_SELECT_SHIFT;
-
-	return P4_EVENT_PACK(e, s);
-}
-
 static inline bool p4_is_event_cascaded(u64 config)
 {
 	u32 cccr = p4_config_unpack_cccr(config);
@@ -212,19 +200,73 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 
 	if (!p4_ht_thread(cpu)) {
 		if (!exclude_os)
-			escr |= P4_EVNTSEL_T0_OS;
+			escr |= P4_ESCR_T0_OS;
 		if (!exclude_usr)
-			escr |= P4_EVNTSEL_T0_USR;
+			escr |= P4_ESCR_T0_USR;
 	} else {
 		if (!exclude_os)
-			escr |= P4_EVNTSEL_T1_OS;
+			escr |= P4_ESCR_T1_OS;
 		if (!exclude_usr)
-			escr |= P4_EVNTSEL_T1_USR;
+			escr |= P4_ESCR_T1_USR;
 	}
 
 	return escr;
 }
 
+enum P4_EVENTS {
+	P4_EVENT_TC_DELIVER_MODE,
+	P4_EVENT_BPU_FETCH_REQUEST,
+	P4_EVENT_ITLB_REFERENCE,
+	P4_EVENT_MEMORY_CANCEL,
+	P4_EVENT_MEMORY_COMPLETE,
+	P4_EVENT_LOAD_PORT_REPLAY,
+	P4_EVENT_STORE_PORT_REPLAY,
+	P4_EVENT_MOB_LOAD_REPLAY,
+	P4_EVENT_PAGE_WALK_TYPE,
+	P4_EVENT_BSQ_CACHE_REFERENCE,
+	P4_EVENT_IOQ_ALLOCATION,
+	P4_EVENT_IOQ_ACTIVE_ENTRIES,
+	P4_EVENT_FSB_DATA_ACTIVITY,
+	P4_EVENT_BSQ_ALLOCATION,
+	P4_EVENT_BSQ_ACTIVE_ENTRIES,
+	P4_EVENT_SSE_INPUT_ASSIST,
+	P4_EVENT_PACKED_SP_UOP,
+	P4_EVENT_PACKED_DP_UOP,
+	P4_EVENT_SCALAR_SP_UOP,
+	P4_EVENT_SCALAR_DP_UOP,
+	P4_EVENT_64BIT_MMX_UOP,
+	P4_EVENT_128BIT_MMX_UOP,
+	P4_EVENT_X87_FP_UOP,
+	P4_EVENT_TC_MISC,
+	P4_EVENT_GLOBAL_POWER_EVENTS,
+	P4_EVENT_TC_MS_XFER,
+	P4_EVENT_UOP_QUEUE_WRITES,
+	P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE,
+	P4_EVENT_RETIRED_BRANCH_TYPE,
+	P4_EVENT_RESOURCE_STALL,
+	P4_EVENT_WC_BUFFER,
+	P4_EVENT_B2B_CYCLES,
+	P4_EVENT_BNR,
+	P4_EVENT_SNOOP,
+	P4_EVENT_RESPONSE,
+	P4_EVENT_FRONT_END_EVENT,
+	P4_EVENT_EXECUTION_EVENT,
+	P4_EVENT_REPLAY_EVENT,
+	P4_EVENT_INSTR_RETIRED,
+	P4_EVENT_UOPS_RETIRED,
+	P4_EVENT_UOP_TYPE,
+	P4_EVENT_BRANCH_RETIRED,
+	P4_EVENT_MISPRED_BRANCH_RETIRED,
+	P4_EVENT_X87_ASSIST,
+	P4_EVENT_MACHINE_CLEAR,
+	P4_EVENT_INSTR_COMPLETED,
+};
+
+#define P4_OPCODE(event)		event##_OPCODE
+#define P4_OPCODE_ESEL(opcode)		((opcode & 0x00ff) >> 0)
+#define P4_OPCODE_EVNT(opcode)		((opcode & 0xff00) >> 8)
+#define P4_OPCODE_PACK(event, sel)	(((event) << 8) | sel)
+
 /*
  * Comments below the event represent ESCR restriction
  * for this event and counter index per ESCR
@@ -238,484 +280,515 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
  * working so that we should not use this CCCR and respective
  * counter as result
  */
-#define P4_TC_DELIVER_MODE		P4_EVENT_PACK(0x01, 0x01)
+enum P4_EVENT_OPCODES {
+	P4_OPCODE(P4_EVENT_TC_DELIVER_MODE)		= P4_OPCODE_PACK(0x01, 0x01),
 	/*
 	 * MSR_P4_TC_ESCR0:	4, 5
 	 * MSR_P4_TC_ESCR1:	6, 7
 	 */
 
-#define P4_BPU_FETCH_REQUEST		P4_EVENT_PACK(0x03, 0x00)
+	P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST)		= P4_OPCODE_PACK(0x03, 0x00),
 	/*
 	 * MSR_P4_BPU_ESCR0:	0, 1
 	 * MSR_P4_BPU_ESCR1:	2, 3
 	 */
 
-#define P4_ITLB_REFERENCE		P4_EVENT_PACK(0x18, 0x03)
+	P4_OPCODE(P4_EVENT_ITLB_REFERENCE)		= P4_OPCODE_PACK(0x18, 0x03),
 	/*
 	 * MSR_P4_ITLB_ESCR0:	0, 1
 	 * MSR_P4_ITLB_ESCR1:	2, 3
 	 */
 
-#define P4_MEMORY_CANCEL		P4_EVENT_PACK(0x02, 0x05)
+	P4_OPCODE(P4_EVENT_MEMORY_CANCEL)		= P4_OPCODE_PACK(0x02, 0x05),
 	/*
 	 * MSR_P4_DAC_ESCR0:	8, 9
 	 * MSR_P4_DAC_ESCR1:	10, 11
 	 */
 
-#define P4_MEMORY_COMPLETE		P4_EVENT_PACK(0x08, 0x02)
+	P4_OPCODE(P4_EVENT_MEMORY_COMPLETE)		= P4_OPCODE_PACK(0x08, 0x02),
 	/*
 	 * MSR_P4_SAAT_ESCR0:	8, 9
 	 * MSR_P4_SAAT_ESCR1:	10, 11
 	 */
 
-#define P4_LOAD_PORT_REPLAY		P4_EVENT_PACK(0x04, 0x02)
+	P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY)		= P4_OPCODE_PACK(0x04, 0x02),
 	/*
 	 * MSR_P4_SAAT_ESCR0:	8, 9
 	 * MSR_P4_SAAT_ESCR1:	10, 11
 	 */
 
-#define P4_STORE_PORT_REPLAY		P4_EVENT_PACK(0x05, 0x02)
+	P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY)		= P4_OPCODE_PACK(0x05, 0x02),
 	/*
 	 * MSR_P4_SAAT_ESCR0:	8, 9
 	 * MSR_P4_SAAT_ESCR1:	10, 11
 	 */
 
-#define P4_MOB_LOAD_REPLAY		P4_EVENT_PACK(0x03, 0x02)
+	P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY)		= P4_OPCODE_PACK(0x03, 0x02),
 	/*
 	 * MSR_P4_MOB_ESCR0:	0, 1
 	 * MSR_P4_MOB_ESCR1:	2, 3
 	 */
 
-#define P4_PAGE_WALK_TYPE		P4_EVENT_PACK(0x01, 0x04)
+	P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE)		= P4_OPCODE_PACK(0x01, 0x04),
 	/*
 	 * MSR_P4_PMH_ESCR0:	0, 1
 	 * MSR_P4_PMH_ESCR1:	2, 3
 	 */
 
-#define P4_BSQ_CACHE_REFERENCE		P4_EVENT_PACK(0x0c, 0x07)
+	P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE)		= P4_OPCODE_PACK(0x0c, 0x07),
 	/*
 	 * MSR_P4_BSU_ESCR0:	0, 1
 	 * MSR_P4_BSU_ESCR1:	2, 3
 	 */
 
-#define P4_IOQ_ALLOCATION		P4_EVENT_PACK(0x03, 0x06)
+	P4_OPCODE(P4_EVENT_IOQ_ALLOCATION)		= P4_OPCODE_PACK(0x03, 0x06),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_IOQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x1a, 0x06)
+	P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES)		= P4_OPCODE_PACK(0x1a, 0x06),
 	/*
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_FSB_DATA_ACTIVITY		P4_EVENT_PACK(0x17, 0x06)
+	P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY)		= P4_OPCODE_PACK(0x17, 0x06),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_BSQ_ALLOCATION		P4_EVENT_PACK(0x05, 0x07)
+	P4_OPCODE(P4_EVENT_BSQ_ALLOCATION)		= P4_OPCODE_PACK(0x05, 0x07),
 	/*
 	 * MSR_P4_BSU_ESCR0:	0, 1
 	 */
 
-#define P4_BSQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x06, 0x07)
+	P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES)		= P4_OPCODE_PACK(0x06, 0x07),
 	/*
 	 * NOTE: no ESCR name in docs, it's guessed
 	 * MSR_P4_BSU_ESCR1:	2, 3
 	 */
 
-#define P4_SSE_INPUT_ASSIST		P4_EVENT_PACK(0x34, 0x01)
+	P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST)		= P4_OPCODE_PACK(0x34, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_PACKED_SP_UOP		P4_EVENT_PACK(0x08, 0x01)
+	P4_OPCODE(P4_EVENT_PACKED_SP_UOP)		= P4_OPCODE_PACK(0x08, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_PACKED_DP_UOP		P4_EVENT_PACK(0x0c, 0x01)
+	P4_OPCODE(P4_EVENT_PACKED_DP_UOP)		= P4_OPCODE_PACK(0x0c, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_SCALAR_SP_UOP		P4_EVENT_PACK(0x0a, 0x01)
+	P4_OPCODE(P4_EVENT_SCALAR_SP_UOP)		= P4_OPCODE_PACK(0x0a, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_SCALAR_DP_UOP		P4_EVENT_PACK(0x0e, 0x01)
+	P4_OPCODE(P4_EVENT_SCALAR_DP_UOP)		= P4_OPCODE_PACK(0x0e, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_64BIT_MMX_UOP		P4_EVENT_PACK(0x02, 0x01)
+	P4_OPCODE(P4_EVENT_64BIT_MMX_UOP)		= P4_OPCODE_PACK(0x02, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_128BIT_MMX_UOP		P4_EVENT_PACK(0x1a, 0x01)
+	P4_OPCODE(P4_EVENT_128BIT_MMX_UOP)		= P4_OPCODE_PACK(0x1a, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_X87_FP_UOP			P4_EVENT_PACK(0x04, 0x01)
+	P4_OPCODE(P4_EVENT_X87_FP_UOP)			= P4_OPCODE_PACK(0x04, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_TC_MISC			P4_EVENT_PACK(0x06, 0x01)
+	P4_OPCODE(P4_EVENT_TC_MISC)			= P4_OPCODE_PACK(0x06, 0x01),
 	/*
 	 * MSR_P4_TC_ESCR0:	4, 5
 	 * MSR_P4_TC_ESCR1:	6, 7
 	 */
 
-#define P4_GLOBAL_POWER_EVENTS		P4_EVENT_PACK(0x13, 0x06)
+	P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS)		= P4_OPCODE_PACK(0x13, 0x06),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_TC_MS_XFER			P4_EVENT_PACK(0x05, 0x00)
+	P4_OPCODE(P4_EVENT_TC_MS_XFER)			= P4_OPCODE_PACK(0x05, 0x00),
 	/*
 	 * MSR_P4_MS_ESCR0:	4, 5
 	 * MSR_P4_MS_ESCR1:	6, 7
 	 */
 
-#define P4_UOP_QUEUE_WRITES		P4_EVENT_PACK(0x09, 0x00)
+	P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES)		= P4_OPCODE_PACK(0x09, 0x00),
 	/*
 	 * MSR_P4_MS_ESCR0:	4, 5
 	 * MSR_P4_MS_ESCR1:	6, 7
 	 */
 
-#define P4_RETIRED_MISPRED_BRANCH_TYPE	P4_EVENT_PACK(0x05, 0x02)
+	P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE)	= P4_OPCODE_PACK(0x05, 0x02),
 	/*
 	 * MSR_P4_TBPU_ESCR0:	4, 5
 	 * MSR_P4_TBPU_ESCR1:	6, 7
 	 */
 
-#define P4_RETIRED_BRANCH_TYPE		P4_EVENT_PACK(0x04, 0x02)
+	P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE)		= P4_OPCODE_PACK(0x04, 0x02),
 	/*
 	 * MSR_P4_TBPU_ESCR0:	4, 5
 	 * MSR_P4_TBPU_ESCR1:	6, 7
 	 */
 
-#define P4_RESOURCE_STALL		P4_EVENT_PACK(0x01, 0x01)
+	P4_OPCODE(P4_EVENT_RESOURCE_STALL)		= P4_OPCODE_PACK(0x01, 0x01),
 	/*
 	 * MSR_P4_ALF_ESCR0:	12, 13, 16
 	 * MSR_P4_ALF_ESCR1:	14, 15, 17
 	 */
 
-#define P4_WC_BUFFER			P4_EVENT_PACK(0x05, 0x05)
+	P4_OPCODE(P4_EVENT_WC_BUFFER)			= P4_OPCODE_PACK(0x05, 0x05),
 	/*
 	 * MSR_P4_DAC_ESCR0:	8, 9
 	 * MSR_P4_DAC_ESCR1:	10, 11
 	 */
 
-#define P4_B2B_CYCLES			P4_EVENT_PACK(0x16, 0x03)
+	P4_OPCODE(P4_EVENT_B2B_CYCLES)			= P4_OPCODE_PACK(0x16, 0x03),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_BNR				P4_EVENT_PACK(0x08, 0x03)
+	P4_OPCODE(P4_EVENT_BNR)				= P4_OPCODE_PACK(0x08, 0x03),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_SNOOP			P4_EVENT_PACK(0x06, 0x03)
+	P4_OPCODE(P4_EVENT_SNOOP)			= P4_OPCODE_PACK(0x06, 0x03),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_RESPONSE			P4_EVENT_PACK(0x04, 0x03)
+	P4_OPCODE(P4_EVENT_RESPONSE)			= P4_OPCODE_PACK(0x04, 0x03),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_FRONT_END_EVENT		P4_EVENT_PACK(0x08, 0x05)
+	P4_OPCODE(P4_EVENT_FRONT_END_EVENT)		= P4_OPCODE_PACK(0x08, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_EXECUTION_EVENT		P4_EVENT_PACK(0x0c, 0x05)
+	P4_OPCODE(P4_EVENT_EXECUTION_EVENT)		= P4_OPCODE_PACK(0x0c, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_REPLAY_EVENT			P4_EVENT_PACK(0x09, 0x05)
+	P4_OPCODE(P4_EVENT_REPLAY_EVENT)		= P4_OPCODE_PACK(0x09, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_INSTR_RETIRED		P4_EVENT_PACK(0x02, 0x04)
+	P4_OPCODE(P4_EVENT_INSTR_RETIRED)		= P4_OPCODE_PACK(0x02, 0x04),
 	/*
 	 * MSR_P4_CRU_ESCR0:	12, 13, 16
 	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
-#define P4_UOPS_RETIRED			P4_EVENT_PACK(0x01, 0x04)
+	P4_OPCODE(P4_EVENT_UOPS_RETIRED)		= P4_OPCODE_PACK(0x01, 0x04),
 	/*
 	 * MSR_P4_CRU_ESCR0:	12, 13, 16
 	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
-#define P4_UOP_TYPE			P4_EVENT_PACK(0x02, 0x02)
+	P4_OPCODE(P4_EVENT_UOP_TYPE)			= P4_OPCODE_PACK(0x02, 0x02),
 	/*
 	 * MSR_P4_RAT_ESCR0:	12, 13, 16
 	 * MSR_P4_RAT_ESCR1:	14, 15, 17
 	 */
 
-#define P4_BRANCH_RETIRED		P4_EVENT_PACK(0x06, 0x05)
+	P4_OPCODE(P4_EVENT_BRANCH_RETIRED)		= P4_OPCODE_PACK(0x06, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_MISPRED_BRANCH_RETIRED	P4_EVENT_PACK(0x03, 0x04)
+	P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED)	= P4_OPCODE_PACK(0x03, 0x04),
 	/*
 	 * MSR_P4_CRU_ESCR0:	12, 13, 16
 	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
-#define P4_X87_ASSIST			P4_EVENT_PACK(0x03, 0x05)
+	P4_OPCODE(P4_EVENT_X87_ASSIST)			= P4_OPCODE_PACK(0x03, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_MACHINE_CLEAR		P4_EVENT_PACK(0x02, 0x05)
+	P4_OPCODE(P4_EVENT_MACHINE_CLEAR)		= P4_OPCODE_PACK(0x02, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_INSTR_COMPLETED		P4_EVENT_PACK(0x07, 0x04)
+	P4_OPCODE(P4_EVENT_INSTR_COMPLETED)		= P4_OPCODE_PACK(0x07, 0x04),
 	/*
 	 * MSR_P4_CRU_ESCR0:	12, 13, 16
 	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
+};
 
 /*
- * a caller should use P4_EVENT_ATTR helper to
- * pick the attribute needed, for example
+ * a caller should use P4_ESCR_EMASK_NAME helper to
+ * pick the EventMask needed, for example
  *
- *	P4_EVENT_ATTR(P4_TC_DELIVER_MODE, DD)
+ *	P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD)
  */
-enum P4_EVENTS_ATTR {
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DD, 0),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DB, 1),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DI, 2),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BD, 3),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BB, 4),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BI, 5),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, ID, 6),
-
-	P4_MAKE_EVENT_ATTR(P4_BPU_FETCH_REQUEST, TCMISS, 0),
-
-	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, HIT, 0),
-	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, MISS, 1),
-	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, HIT_UK, 2),
-
-	P4_MAKE_EVENT_ATTR(P4_MEMORY_CANCEL, ST_RB_FULL, 2),
-	P4_MAKE_EVENT_ATTR(P4_MEMORY_CANCEL, 64K_CONF, 3),
-
-	P4_MAKE_EVENT_ATTR(P4_MEMORY_COMPLETE, LSC, 0),
-	P4_MAKE_EVENT_ATTR(P4_MEMORY_COMPLETE, SSC, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_LOAD_PORT_REPLAY, SPLIT_LD, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_STORE_PORT_REPLAY, SPLIT_ST, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, NO_STA, 1),
-	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, NO_STD, 3),
-	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
-	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
-
-	P4_MAKE_EVENT_ATTR(P4_PAGE_WALK_TYPE, DTMISS, 0),
-	P4_MAKE_EVENT_ATTR(P4_PAGE_WALK_TYPE, ITMISS, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
-
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, DEFAULT, 0),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, ALL_READ, 5),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, ALL_WRITE, 6),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_UC, 7),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WC, 8),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WT, 9),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WP, 10),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WB, 11),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, OWN, 13),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, OTHER, 14),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, PREFETCH, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, OWN, 13),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, OTHER, 14),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
-
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_TYPE0, 0),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_TYPE1, 1),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LEN0, 2),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LEN1, 3),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE0, 11),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE1, 12),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE2, 13),
-
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
-
-	P4_MAKE_EVENT_ATTR(P4_SSE_INPUT_ASSIST, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_PACKED_SP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_PACKED_DP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_SCALAR_SP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_SCALAR_DP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_64BIT_MMX_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_128BIT_MMX_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_X87_FP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_TC_MISC, FLUSH, 4),
-
-	P4_MAKE_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING, 0),
-
-	P4_MAKE_EVENT_ATTR(P4_TC_MS_XFER, CISC, 0),
-
-	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
-	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
-	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_ROM, 2),
-
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
-
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL, 2),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, RETURN, 3),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, INDIRECT, 4),
-
-	P4_MAKE_EVENT_ATTR(P4_RESOURCE_STALL, SBFULL, 5),
-
-	P4_MAKE_EVENT_ATTR(P4_WC_BUFFER, WCB_EVICTS, 0),
-	P4_MAKE_EVENT_ATTR(P4_WC_BUFFER, WCB_FULL_EVICTS, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_FRONT_END_EVENT, NBOGUS, 0),
-	P4_MAKE_EVENT_ATTR(P4_FRONT_END_EVENT, BOGUS, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS0, 0),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS1, 1),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS2, 2),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS3, 3),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS0, 4),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS1, 5),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS2, 6),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS3, 7),
-
-	P4_MAKE_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS, 0),
-	P4_MAKE_EVENT_ATTR(P4_REPLAY_EVENT, BOGUS, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG, 0),
-	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSTAG, 1),
-	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG, 2),
-	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSTAG, 3),
-
-	P4_MAKE_EVENT_ATTR(P4_UOPS_RETIRED, NBOGUS, 0),
-	P4_MAKE_EVENT_ATTR(P4_UOPS_RETIRED, BOGUS, 1),
+enum P4_ESCR_EMASKS {
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DB, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DI, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BD, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BB, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BI, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, ID, 6),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BPU_FETCH_REQUEST, TCMISS, 0),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, MISS, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT_UK, 2),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, 64K_CONF, 3),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, LSC, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, SSC, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STA, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STD, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, DTMISS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, ITMISS, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, DEFAULT, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_READ, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_UC, 7),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WC, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WT, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WP, 10),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WB, 11),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OWN, 13),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OTHER, 14),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, PREFETCH, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN, 13),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER, 14),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0, 11),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1, 12),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2, 13),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_SSE_INPUT_ASSIST, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_SP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_DP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_SP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_DP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_64BIT_MMX_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_128BIT_MMX_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_FP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_MISC, FLUSH, 4),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING, 0),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_MS_XFER, CISC, 0),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM, 2),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CALL, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT, 4),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_RESOURCE_STALL, SBFULL, 5),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_EVICTS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, BOGUS, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS0, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS1, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS2, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS3, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS0, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS1, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS2, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS3, 7),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, BOGUS, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSTAG, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSNTAG, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSTAG, 3),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, BOGUS, 1),
 
-	P4_MAKE_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS, 1),
-	P4_MAKE_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES, 2),
-
-	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMNP, 0),
-	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMNM, 1),
-	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMTP, 2),
-	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMTM, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGLOADS, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGSTORES, 2),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNP, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNM, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTP, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTM, 3),
 
-	P4_MAKE_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
 
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, FPSU, 0),
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, FPSO, 1),
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, POAO, 2),
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, POAU, 3),
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, PREA, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSU, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSO, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAO, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAU, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, PREA, 4),
 
-	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, CLEAR, 0),
-	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, MOCLEAR, 1),
-	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, SMCLEAR, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, CLEAR, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, MOCLEAR, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, SMCLEAR, 2),
 
-	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, NBOGUS, 0),
-	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, BOGUS, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1),
 };
 
-enum {
-	KEY_P4_L1D_OP_READ_RESULT_MISS = PERF_COUNT_HW_MAX,
-	KEY_P4_LL_OP_READ_RESULT_MISS,
-	KEY_P4_DTLB_OP_READ_RESULT_MISS,
-	KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
-	KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
-	KEY_P4_ITLB_OP_READ_RESULT_MISS,
-	KEY_P4_UOP_TYPE,
+/* P4 PEBS: stale for a while */
+#define P4_PEBS_METRIC_MASK	0x00001fffU
+#define P4_PEBS_UOB_TAG		0x01000000U
+#define P4_PEBS_ENABLE		0x02000000U
+
+/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */
+#define P4_PEBS__1stl_cache_load_miss_retired	0x3000001
+#define P4_PEBS__2ndl_cache_load_miss_retired	0x3000002
+#define P4_PEBS__dtlb_load_miss_retired		0x3000004
+#define P4_PEBS__dtlb_store_miss_retired	0x3000004
+#define P4_PEBS__dtlb_all_miss_retired		0x3000004
+#define P4_PEBS__tagged_mispred_branch		0x3018000
+#define P4_PEBS__mob_load_replay_retired	0x3000200
+#define P4_PEBS__split_load_retired		0x3000400
+#define P4_PEBS__split_store_retired		0x3000400
+
+#define P4_VERT__1stl_cache_load_miss_retired	0x0000001
+#define P4_VERT__2ndl_cache_load_miss_retired	0x0000001
+#define P4_VERT__dtlb_load_miss_retired		0x0000001
+#define P4_VERT__dtlb_store_miss_retired	0x0000002
+#define P4_VERT__dtlb_all_miss_retired		0x0000003
+#define P4_VERT__tagged_mispred_branch		0x0000010
+#define P4_VERT__mob_load_replay_retired	0x0000001
+#define P4_VERT__split_load_retired		0x0000001
+#define P4_VERT__split_store_retired		0x0000002
+
+enum P4_CACHE_EVENTS {
+	P4_CACHE__NONE,
+
+	P4_CACHE__1stl_cache_load_miss_retired,
+	P4_CACHE__2ndl_cache_load_miss_retired,
+	P4_CACHE__dtlb_load_miss_retired,
+	P4_CACHE__dtlb_store_miss_retired,
+	P4_CACHE__itlb_reference_hit,
+	P4_CACHE__itlb_reference_miss,
+
+	P4_CACHE__MAX
 };
 
 #endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index b8a811ab7609..f8fe069f14e2 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -11,35 +11,281 @@
 
 #include <asm/perf_event_p4.h>
 
+#define P4_CNTR_LIMIT 3
 /*
  * array indices: 0,1 - HT threads, used with HT enabled cpu
  */
-struct p4_event_template {
-	u32 opcode;			/* ESCR event + CCCR selector */
-	u64 config;			/* packed predefined bits */
-	int dep;			/* upstream dependency event index */
-	int key;			/* index into p4_templates */
-	u64 msr;			/*
-					 * the high 32 bits set into MSR_IA32_PEBS_ENABLE and
-					 * the low 32 bits set into MSR_P4_PEBS_MATRIX_VERT
-					 * for cache events
-					 */
-	unsigned int emask;		/* ESCR EventMask */
-	unsigned int escr_msr[2];	/* ESCR MSR for this event */
-	unsigned int cntr[2];		/* counter index (offset) */
+struct p4_event_bind {
+	unsigned int opcode;			/* Event code and ESCR selector */
+	unsigned int escr_msr[2];		/* ESCR MSR for this event */
+	unsigned char cntr[2][P4_CNTR_LIMIT];	/* counter index (offset), -1 on abscence */
 };
 
-struct p4_pmu_res {
-	/* maps hw_conf::idx into template for ESCR sake */
-	struct p4_event_template *tpl[ARCH_P4_MAX_CCCR];
+struct p4_cache_event_bind {
+	unsigned int metric_pebs;
+	unsigned int metric_vert;
 };
 
-static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
+#define P4_GEN_CACHE_EVENT_BIND(name)		\
+	[P4_CACHE__##name] = {			\
+		.metric_pebs = P4_PEBS__##name,	\
+		.metric_vert = P4_VERT__##name,	\
+	}
+
+static struct p4_cache_event_bind p4_cache_event_bind_map[] = {
+	P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired),
+	P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired),
+	P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired),
+	P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired),
+};
+
+/*
+ * Note that we don't use CCCR1 here, there is an
+ * exception for P4_BSQ_ALLOCATION but we just have
+ * no workaround
+ *
+ * consider this binding as resources which particular
+ * event may borrow, it doesn't contain EventMask,
+ * Tags and friends -- they are left to a caller
+ */
+static struct p4_event_bind p4_event_bind_map[] = {
+	[P4_EVENT_TC_DELIVER_MODE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
+		.escr_msr	= { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_BPU_FETCH_REQUEST] = {
+		.opcode		= P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
+		.escr_msr	= { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_ITLB_REFERENCE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
+		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_MEMORY_CANCEL] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
+		.escr_msr	= { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_MEMORY_COMPLETE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
+		.escr_msr	= { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_LOAD_PORT_REPLAY] = {
+		.opcode		= P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
+		.escr_msr	= { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_STORE_PORT_REPLAY] = {
+		.opcode		= P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
+		.escr_msr	= { MSR_P4_SAAT_ESCR0 ,  MSR_P4_SAAT_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_MOB_LOAD_REPLAY] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
+		.escr_msr	= { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_PAGE_WALK_TYPE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
+		.escr_msr	= { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_BSQ_CACHE_REFERENCE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
+		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_IOQ_ALLOCATION] = {
+		.opcode		= P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_IOQ_ACTIVE_ENTRIES] = {	/* shared ESCR */
+		.opcode		= P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
+		.escr_msr	= { MSR_P4_FSB_ESCR1,  MSR_P4_FSB_ESCR1 },
+		.cntr		= { {2, -1, -1}, {3, -1, -1} },
+	},
+	[P4_EVENT_FSB_DATA_ACTIVITY] = {
+		.opcode		= P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_BSQ_ALLOCATION] = {		/* shared ESCR, broken CCCR1 */
+		.opcode		= P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
+		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+		.cntr		= { {0, -1, -1}, {1, -1, -1} },
+	},
+	[P4_EVENT_BSQ_ACTIVE_ENTRIES] = {	/* shared ESCR */
+		.opcode		= P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
+		.escr_msr	= { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+		.cntr		= { {2, -1, -1}, {3, -1, -1} },
+	},
+	[P4_EVENT_SSE_INPUT_ASSIST] = {
+		.opcode		= P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_PACKED_SP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_PACKED_DP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_SCALAR_SP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_SCALAR_DP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_64BIT_MMX_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_128BIT_MMX_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_X87_FP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_X87_FP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_TC_MISC] = {
+		.opcode		= P4_OPCODE(P4_EVENT_TC_MISC),
+		.escr_msr	= { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_GLOBAL_POWER_EVENTS] = {
+		.opcode		= P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_TC_MS_XFER] = {
+		.opcode		= P4_OPCODE(P4_EVENT_TC_MS_XFER),
+		.escr_msr	= { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_UOP_QUEUE_WRITES] = {
+		.opcode		= P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
+		.escr_msr	= { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
+		.escr_msr	= { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_RETIRED_BRANCH_TYPE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
+		.escr_msr	= { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_RESOURCE_STALL] = {
+		.opcode		= P4_OPCODE(P4_EVENT_RESOURCE_STALL),
+		.escr_msr	= { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_WC_BUFFER] = {
+		.opcode		= P4_OPCODE(P4_EVENT_WC_BUFFER),
+		.escr_msr	= { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_B2B_CYCLES] = {
+		.opcode		= P4_OPCODE(P4_EVENT_B2B_CYCLES),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_BNR] = {
+		.opcode		= P4_OPCODE(P4_EVENT_BNR),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_SNOOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_SNOOP),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_RESPONSE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_RESPONSE),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_FRONT_END_EVENT] = {
+		.opcode		= P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_EXECUTION_EVENT] = {
+		.opcode		= P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_REPLAY_EVENT] = {
+		.opcode		= P4_OPCODE(P4_EVENT_REPLAY_EVENT),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_INSTR_RETIRED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_INSTR_RETIRED),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_UOPS_RETIRED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_UOPS_RETIRED),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_UOP_TYPE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_UOP_TYPE),
+		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_BRANCH_RETIRED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_MISPRED_BRANCH_RETIRED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_X87_ASSIST] = {
+		.opcode		= P4_OPCODE(P4_EVENT_X87_ASSIST),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_MACHINE_CLEAR] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_INSTR_COMPLETED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+};
 
-#define P4_CACHE_EVENT_CONFIG(event, bit) \
-	p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(event) << P4_EVNTSEL_EVENT_SHIFT) | \
-	p4_config_pack_escr((event##_##bit) << P4_EVNTSEL_EVENTMASK_SHIFT) | \
-	p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(event) << P4_CCCR_ESCR_SELECT_SHIFT)
+#define P4_GEN_CACHE_EVENT(event, bit, cache_event)			  \
+	p4_config_pack_escr(P4_ESCR_EVENT(event)			| \
+			    P4_ESCR_EMASK_BIT(event, bit))		| \
+	p4_config_pack_cccr(cache_event					| \
+			    P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
 
 static __initconst u64 p4_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
@@ -49,42 +295,35 @@ static __initconst u64 p4_hw_cache_event_ids
  [ C(L1D ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
-					/* 1stL_cache_load_miss_retired */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
-					| KEY_P4_L1D_OP_READ_RESULT_MISS,
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+						P4_CACHE__1stl_cache_load_miss_retired),
 	},
  },
  [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
-					/* 2ndL_cache_load_miss_retired */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
-					| KEY_P4_LL_OP_READ_RESULT_MISS,
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+						P4_CACHE__2ndl_cache_load_miss_retired),
 	},
- },
+},
  [ C(DTLB) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
-					/* DTLB_load_miss_retired */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
-					| KEY_P4_DTLB_OP_READ_RESULT_MISS,
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+						P4_CACHE__dtlb_load_miss_retired),
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
-					/* DTLB_store_miss_retired */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
-					| KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+						P4_CACHE__dtlb_store_miss_retired),
 	},
  },
  [ C(ITLB) ] = {
 	[ C(OP_READ) ] = {
-					/* ITLB_reference.HIT */
-		[ C(RESULT_ACCESS) ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, HIT)
-					| KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
-
-					/* ITLB_reference.MISS */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, MISS)
-					| KEY_P4_ITLB_OP_READ_RESULT_MISS,
+		[ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
+						P4_CACHE__itlb_reference_hit),
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
+						P4_CACHE__itlb_reference_miss),
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = -1,
@@ -97,219 +336,89 @@ static __initconst u64 p4_hw_cache_event_ids
  },
 };
 
-/*
- * WARN: CCCR1 doesn't have a working enable bit so try to not
- * use it if possible
- *
- * Also as only we start to support raw events we will need to
- * append _all_ P4_EVENT_PACK'ed events here
- */
-struct p4_event_template p4_templates[] = {
-	[0] = {
-		.opcode	= P4_GLOBAL_POWER_EVENTS,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 0,
-		.emask	=
-			P4_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING),
-		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[1] = {
-		.opcode	= P4_INSTR_RETIRED,
-		.config	= 0,
-		.dep	= -1, /* needs front-end tagging */
-		.key	= 1,
-		.emask	=
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG)	|
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG),
-		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-		.cntr		= { 12, 14 },
-	},
-	[2] = {
-		.opcode	= P4_BSQ_CACHE_REFERENCE,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 2,
-		.emask	=
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITM),
-		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[3] = {
-		.opcode	= P4_BSQ_CACHE_REFERENCE,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 3,
-		.emask	=
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
-		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
-		.cntr		= { 0, 3 },
-	},
-	[4] = {
-		.opcode	= P4_RETIRED_BRANCH_TYPE,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 4,
-		.emask	=
-			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL)	|
-			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL)		|
-			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, RETURN)		|
-			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, INDIRECT),
-		.escr_msr	= { MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1 },
-		.cntr		= { 4, 6 },
-	},
-	[5] = {
-		.opcode	= P4_MISPRED_BRANCH_RETIRED,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 5,
-		.emask	=
-			P4_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-		.cntr		= { 12, 14 },
-	},
-	[6] = {
-		.opcode	= P4_FSB_DATA_ACTIVITY,
-		.config	= p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
-		.dep	= -1,
-		.key	= 6,
-		.emask	=
-			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV)	|
-			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN),
-		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[KEY_P4_L1D_OP_READ_RESULT_MISS] = {
-		.opcode	= P4_REPLAY_EVENT,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= (u64)(1 << 0 | 1 << 24) << 32 | (1 << 0),
-		.key	= KEY_P4_L1D_OP_READ_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
-		.cntr		= { 16, 17 },
-	},
-	[KEY_P4_LL_OP_READ_RESULT_MISS] = {
-		.opcode	= P4_REPLAY_EVENT,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= (u64)(1 << 1 | 1 << 24) << 32 | (1 << 0),
-		.key	= KEY_P4_LL_OP_READ_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
-		.cntr		= { 16, 17 },
-	},
-	[KEY_P4_DTLB_OP_READ_RESULT_MISS] = {
-		.opcode	= P4_REPLAY_EVENT,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= (u64)(1 << 2 | 1 << 24) << 32 | (1 << 0),
-		.key	= KEY_P4_DTLB_OP_READ_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
-		.cntr		= { 16, 17 },
-	},
-	[KEY_P4_DTLB_OP_WRITE_RESULT_MISS] = {
-		.opcode	= P4_REPLAY_EVENT,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= (u64)(1 << 2 | 1 << 24) << 32 | (1 << 1),
-		.key	= KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
-		.cntr		= { 16, 17 },
-	},
-	[KEY_P4_ITLB_OP_READ_RESULT_ACCESS] = {
-		.opcode	= P4_ITLB_REFERENCE,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= 0,
-		.key	= KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
-		.emask	=
-			P4_EVENT_ATTR(P4_ITLB_REFERENCE, HIT),
-		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[KEY_P4_ITLB_OP_READ_RESULT_MISS] = {
-		.opcode	= P4_ITLB_REFERENCE,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= 0,
-		.key	= KEY_P4_ITLB_OP_READ_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_ITLB_REFERENCE, MISS),
-		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[KEY_P4_UOP_TYPE] = {
-		.opcode	= P4_UOP_TYPE,
-		.config	= 0,
-		.dep	= -1,
-		.key	= KEY_P4_UOP_TYPE,
-		.emask	=
-			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
-			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
-		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
-		.cntr		= { 16, 17 },
-	},
+static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
+  /* non-halted CPU clocks */
+  [PERF_COUNT_HW_CPU_CYCLES] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+
+  /*
+   * retired instructions
+   * in a sake of simplicity we don't use the FSB tagging
+   */
+  [PERF_COUNT_HW_INSTRUCTIONS] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
+
+  /* cache hits */
+  [PERF_COUNT_HW_CACHE_REFERENCES] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
+
+  /* cache misses */
+  [PERF_COUNT_HW_CACHE_MISSES] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
+
+  /* branch instructions retired */
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
+
+  /* mispredicted branches retired */
+  [PERF_COUNT_HW_BRANCH_MISSES]	=
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
+
+  /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
+  [PERF_COUNT_HW_BUS_CYCLES] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN))	|
+	p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
 };
 
+static struct p4_event_bind *p4_config_get_bind(u64 config)
+{
+	unsigned int evnt = p4_config_unpack_event(config);
+	struct p4_event_bind *bind = NULL;
+
+	if (evnt < ARRAY_SIZE(p4_event_bind_map))
+		bind = &p4_event_bind_map[evnt];
+
+	return bind;
+}
+
 static u64 p4_pmu_event_map(int hw_event)
 {
-	struct p4_event_template *tpl;
+	struct p4_event_bind *bind;
+	unsigned int esel;
 	u64 config;
 
-	if (hw_event > ARRAY_SIZE(p4_templates)) {
-		printk_once(KERN_ERR "PMU: Incorrect event index\n");
+	if (hw_event > ARRAY_SIZE(p4_general_events)) {
+		printk_once(KERN_ERR "P4 PMU: Bad index: %i\n", hw_event);
 		return 0;
 	}
-	tpl = &p4_templates[hw_event];
 
-	/*
-	 * fill config up according to
-	 * a predefined event template
-	 */
-	config  = tpl->config;
-	config |= p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(tpl->opcode) << P4_EVNTSEL_EVENT_SHIFT);
-	config |= p4_config_pack_escr(tpl->emask << P4_EVNTSEL_EVENTMASK_SHIFT);
-	config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
-	config |= p4_config_pack_cccr(hw_event & P4_CCCR_RESERVED);
+	config = p4_general_events[hw_event];
+	bind = p4_config_get_bind(config);
+	esel = P4_OPCODE_ESEL(bind->opcode);
+	config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
 
 	return config;
 }
 
-/*
- * Note that we still have 5 events (from global events SDM list)
- * intersected in opcode+emask bits so we will need another
- * scheme there do distinguish templates.
- */
-static inline int p4_pmu_emask_match(unsigned int dst, unsigned int src)
-{
-	return dst & src;
-}
-
-static struct p4_event_template *p4_pmu_template_lookup(u64 config)
-{
-	int key = p4_config_unpack_key(config);
-
-	if (key < ARRAY_SIZE(p4_templates))
-		return &p4_templates[key];
-	else
-		return NULL;
-}
-
 /*
  * We don't control raw events so it's up to the caller
  * to pass sane values (and we don't count the thread number
@@ -319,13 +428,14 @@ static struct p4_event_template *p4_pmu_template_lookup(u64 config)
 static u64 p4_pmu_raw_event(u64 hw_event)
 {
 	return hw_event &
-		(p4_config_pack_escr(P4_EVNTSEL_MASK_HT) |
+		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
 		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
 }
 
 static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
 {
 	int cpu = raw_smp_processor_id();
+	u32 escr, cccr;
 
 	/*
 	 * the reason we use cpu that early is that: if we get scheduled
@@ -333,13 +443,10 @@ static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
 	 * specific flags in config (and will save some cpu cycles)
 	 */
 
-	/* CCCR by default */
-	hwc->config = p4_config_pack_cccr(p4_default_cccr_conf(cpu));
+	cccr = p4_default_cccr_conf(cpu);
+	escr = p4_default_escr_conf(cpu, attr->exclude_kernel, attr->exclude_user);
+	hwc->config = p4_config_pack_escr(escr) | p4_config_pack_cccr(cccr);
 
-	/* Count user and OS events unless not requested to */
-	hwc->config |= p4_config_pack_escr(p4_default_escr_conf(cpu, attr->exclude_kernel,
-								attr->exclude_user));
-	/* on HT machine we need a special bit */
 	if (p4_ht_active() && p4_ht_thread(cpu))
 		hwc->config = p4_set_ht_bit(hwc->config);
 
@@ -368,7 +475,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
 	 */
 	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
 		(u64)(p4_config_unpack_cccr(hwc->config)) &
-			~P4_CCCR_ENABLE & ~P4_CCCR_OVF);
+			~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
 }
 
 static void p4_pmu_disable_all(void)
@@ -389,27 +496,14 @@ static void p4_pmu_enable_event(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	int thread = p4_ht_config_thread(hwc->config);
 	u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
-	u64 escr_base;
-	struct p4_event_template *tpl;
-	struct p4_pmu_res *c;
+	unsigned int idx = p4_config_unpack_event(hwc->config);
+	unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
+	struct p4_event_bind *bind;
+	struct p4_cache_event_bind *bind_cache;
+	u64 escr_addr, cccr;
 
-	/*
-	 * some preparation work from per-cpu private fields
-	 * since we need to find out which ESCR to use
-	 */
-	c = &__get_cpu_var(p4_pmu_config);
-	tpl = c->tpl[hwc->idx];
-	if (!tpl) {
-		pr_crit("%s: Wrong index: %d\n", __func__, hwc->idx);
-		return;
-	}
-
-	if (tpl->msr) {
-		(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, tpl->msr >> 32);
-		(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, tpl->msr & 0xffffffff);
-	}
-
-	escr_base = (u64)tpl->escr_msr[thread];
+	bind = &p4_event_bind_map[idx];
+	escr_addr = (u64)bind->escr_msr[thread];
 
 	/*
 	 * - we dont support cascaded counters yet
@@ -418,9 +512,27 @@ static void p4_pmu_enable_event(struct perf_event *event)
 	WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
 	WARN_ON_ONCE(hwc->idx == 1);
 
-	(void)checking_wrmsrl(escr_base, escr_conf);
+	/* we need a real Event value */
+	escr_conf &= ~P4_ESCR_EVENT_MASK;
+	escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
+
+	cccr = p4_config_unpack_cccr(hwc->config);
+
+	/*
+	 * it could be Cache event so that we need to
+	 * set metrics into additional MSRs
+	 */
+	BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK);
+	if (idx_cache > P4_CACHE__NONE &&
+		idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
+		bind_cache = &p4_cache_event_bind_map[idx_cache];
+		(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
+		(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
+	}
+
+	(void)checking_wrmsrl(escr_addr, escr_conf);
 	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
-		(u64)(p4_config_unpack_cccr(hwc->config)) | P4_CCCR_ENABLE);
+				(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
 }
 
 static void p4_pmu_enable_all(void)
@@ -516,13 +628,13 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 	if (p4_ht_thread(cpu)) {
 		cccr &= ~P4_CCCR_OVF_PMI_T0;
 		cccr |= P4_CCCR_OVF_PMI_T1;
-		if (escr & P4_EVNTSEL_T0_OS) {
-			escr &= ~P4_EVNTSEL_T0_OS;
-			escr |= P4_EVNTSEL_T1_OS;
+		if (escr & P4_ESCR_T0_OS) {
+			escr &= ~P4_ESCR_T0_OS;
+			escr |= P4_ESCR_T1_OS;
 		}
-		if (escr & P4_EVNTSEL_T0_USR) {
-			escr &= ~P4_EVNTSEL_T0_USR;
-			escr |= P4_EVNTSEL_T1_USR;
+		if (escr & P4_ESCR_T0_USR) {
+			escr &= ~P4_ESCR_T0_USR;
+			escr |= P4_ESCR_T1_USR;
 		}
 		hwc->config  = p4_config_pack_escr(escr);
 		hwc->config |= p4_config_pack_cccr(cccr);
@@ -530,13 +642,13 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 	} else {
 		cccr &= ~P4_CCCR_OVF_PMI_T1;
 		cccr |= P4_CCCR_OVF_PMI_T0;
-		if (escr & P4_EVNTSEL_T1_OS) {
-			escr &= ~P4_EVNTSEL_T1_OS;
-			escr |= P4_EVNTSEL_T0_OS;
+		if (escr & P4_ESCR_T1_OS) {
+			escr &= ~P4_ESCR_T1_OS;
+			escr |= P4_ESCR_T0_OS;
 		}
-		if (escr & P4_EVNTSEL_T1_USR) {
-			escr &= ~P4_EVNTSEL_T1_USR;
-			escr |= P4_EVNTSEL_T0_USR;
+		if (escr & P4_ESCR_T1_USR) {
+			escr &= ~P4_ESCR_T1_USR;
+			escr |= P4_ESCR_T0_USR;
 		}
 		hwc->config  = p4_config_pack_escr(escr);
 		hwc->config |= p4_config_pack_cccr(cccr);
@@ -606,66 +718,56 @@ static int p4_get_escr_idx(unsigned int addr)
 	return -1;
 }
 
+static int p4_next_cntr(int thread, unsigned long *used_mask,
+			struct p4_event_bind *bind)
+{
+	int i = 0, j;
+
+	for (i = 0; i < P4_CNTR_LIMIT; i++) {
+		j = bind->cntr[thread][i++];
+		if (j == -1 || !test_bit(j, used_mask))
+			return j;
+	}
+
+	return -1;
+}
+
 static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long escr_mask[BITS_TO_LONGS(ARCH_P4_TOTAL_ESCR)];
-
-	struct hw_perf_event *hwc;
-	struct p4_event_template *tpl;
-	struct p4_pmu_res *c;
 	int cpu = raw_smp_processor_id();
-	int escr_idx, thread, i, num;
+	struct hw_perf_event *hwc;
+	struct p4_event_bind *bind;
+	unsigned int i, thread, num;
+	int cntr_idx, escr_idx;
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 	bitmap_zero(escr_mask, ARCH_P4_TOTAL_ESCR);
 
-	c = &__get_cpu_var(p4_pmu_config);
-	/*
-	 * Firstly find out which resource events are going
-	 * to use, if ESCR+CCCR tuple is already borrowed
-	 * then get out of here
-	 */
 	for (i = 0, num = n; i < n; i++, num--) {
+
 		hwc = &cpuc->event_list[i]->hw;
-		tpl = p4_pmu_template_lookup(hwc->config);
-		if (!tpl)
-			goto done;
 		thread = p4_ht_thread(cpu);
-		escr_idx = p4_get_escr_idx(tpl->escr_msr[thread]);
-		if (escr_idx == -1)
-			goto done;
+		bind = p4_config_get_bind(hwc->config);
+		escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
 
-		/* already allocated and remains on the same cpu */
 		if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+			cntr_idx = hwc->idx;
 			if (assign)
 				assign[i] = hwc->idx;
-			/* upstream dependent event */
-			if (unlikely(tpl->dep != -1))
-				printk_once(KERN_WARNING "PMU: Dep events are "
-					"not implemented yet\n");
 			goto reserve;
 		}
 
-		/* it may be already borrowed */
-		if (test_bit(tpl->cntr[thread], used_mask) ||
-			test_bit(escr_idx, escr_mask))
+		cntr_idx = p4_next_cntr(thread, used_mask, bind);
+		if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
 			goto done;
 
-		/*
-		 * ESCR+CCCR+COUNTERs are available to use lets swap
-		 * thread specific bits, push assigned bits
-		 * back and save template into per-cpu
-		 * area (which will allow us to find out the ESCR
-		 * to be used at moment of "enable event via real MSR")
-		 */
 		p4_pmu_swap_config_ts(hwc, cpu);
-		if (assign) {
-			assign[i] = tpl->cntr[thread];
-			c->tpl[assign[i]] = tpl;
-		}
+		if (assign)
+			assign[i] = cntr_idx;
 reserve:
-		set_bit(tpl->cntr[thread], used_mask);
+		set_bit(cntr_idx, used_mask);
 		set_bit(escr_idx, escr_mask);
 	}
 
@@ -684,7 +786,7 @@ static __initconst struct x86_pmu p4_pmu = {
 	.perfctr		= MSR_P4_BPU_PERFCTR0,
 	.event_map		= p4_pmu_event_map,
 	.raw_event		= p4_pmu_raw_event,
-	.max_events		= ARRAY_SIZE(p4_templates),
+	.max_events		= ARRAY_SIZE(p4_general_events),
 	.get_event_constraints	= x86_get_event_constraints,
 	/*
 	 * IF HT disabled we may need to use all
@@ -716,7 +818,7 @@ static __init int p4_pmu_init(void)
 	}
 
 	memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
-	       sizeof(hw_cache_event_ids));
+		sizeof(hw_cache_event_ids));
 
 	pr_cont("Netburst events, ");
 
-- 
cgit v1.2.3-55-g7522


From 7c5ecaf7666617889f337296c610815b519abfa9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 25 Mar 2010 14:51:49 +0100
Subject: perf, x86: Clean up debugctlmsr bit definitions

Move all debugctlmsr thingies into msr-index.h

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100325135413.861425293@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/msr-index.h           | 13 ++++++++-----
 arch/x86/kernel/cpu/perf_event_intel_ds.c  | 23 +++++++----------------
 arch/x86/kernel/cpu/perf_event_intel_lbr.c |  7 ++-----
 3 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index aef562c0a647..06e4cf0d3846 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -71,11 +71,14 @@
 #define MSR_IA32_LASTINTTOIP		0x000001de
 
 /* DEBUGCTLMSR bits (others vary by model): */
-#define _DEBUGCTLMSR_LBR	0 /* last branch recording */
-#define _DEBUGCTLMSR_BTF	1 /* single-step on branches */
-
-#define DEBUGCTLMSR_LBR		(1UL << _DEBUGCTLMSR_LBR)
-#define DEBUGCTLMSR_BTF		(1UL << _DEBUGCTLMSR_BTF)
+#define DEBUGCTLMSR_LBR			(1UL <<  0) /* last branch recording */
+#define DEBUGCTLMSR_BTF			(1UL <<  1) /* single-step on branches */
+#define DEBUGCTLMSR_TR			(1UL <<  6)
+#define DEBUGCTLMSR_BTS			(1UL <<  7)
+#define DEBUGCTLMSR_BTINT		(1UL <<  8)
+#define DEBUGCTLMSR_BTS_OFF_OS		(1UL <<  9)
+#define DEBUGCTLMSR_BTS_OFF_USR		(1UL << 10)
+#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI	(1UL << 11)
 
 #define MSR_IA32_MC0_CTL		0x00000400
 #define MSR_IA32_MC0_STATUS		0x00000401
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index c59678a14a2e..2fea3622af7f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -37,15 +37,6 @@ struct pebs_record_nhm {
 	u64 status, dla, dse, lat;
 };
 
-/*
- * Bits in the debugctlmsr controlling branch tracing.
- */
-#define X86_DEBUGCTL_TR			(1 << 6)
-#define X86_DEBUGCTL_BTS		(1 << 7)
-#define X86_DEBUGCTL_BTINT		(1 << 8)
-#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)
-#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)
-
 /*
  * A debug store configuration.
  *
@@ -193,15 +184,15 @@ static void intel_pmu_enable_bts(u64 config)
 
 	debugctlmsr = get_debugctlmsr();
 
-	debugctlmsr |= X86_DEBUGCTL_TR;
-	debugctlmsr |= X86_DEBUGCTL_BTS;
-	debugctlmsr |= X86_DEBUGCTL_BTINT;
+	debugctlmsr |= DEBUGCTLMSR_TR;
+	debugctlmsr |= DEBUGCTLMSR_BTS;
+	debugctlmsr |= DEBUGCTLMSR_BTINT;
 
 	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+		debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
 
 	if (!(config & ARCH_PERFMON_EVENTSEL_USR))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+		debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
 
 	update_debugctlmsr(debugctlmsr);
 }
@@ -217,8 +208,8 @@ static void intel_pmu_disable_bts(void)
 	debugctlmsr = get_debugctlmsr();
 
 	debugctlmsr &=
-		~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
-		  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+		~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
+		  DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
 
 	update_debugctlmsr(debugctlmsr);
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index df4c98e26c5b..d202c1bece1a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -12,15 +12,12 @@ enum {
  * otherwise it becomes near impossible to get a reliable stack.
  */
 
-#define X86_DEBUGCTL_LBR               		(1 << 0)
-#define X86_DEBUGCTL_FREEZE_LBRS_ON_PMI		(1 << 11)
-
 static void __intel_pmu_lbr_enable(void)
 {
 	u64 debugctl;
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-	debugctl |= (X86_DEBUGCTL_LBR | X86_DEBUGCTL_FREEZE_LBRS_ON_PMI);
+	debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
@@ -29,7 +26,7 @@ static void __intel_pmu_lbr_disable(void)
 	u64 debugctl;
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-	debugctl &= ~(X86_DEBUGCTL_LBR | X86_DEBUGCTL_FREEZE_LBRS_ON_PMI);
+	debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
-- 
cgit v1.2.3-55-g7522


From faa4602e47690fb11221e00f9b9697c8dc0d4b19 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 25 Mar 2010 14:51:50 +0100
Subject: x86, perf, bts, mm: Delete the never used BTS-ptrace code

Support for the PMU's BTS features has been upstreamed in
v2.6.32, but we still have the old and disabled ptrace-BTS,
as Linus noticed it not so long ago.

It's buggy: TIF_DEBUGCTLMSR is trampling all over that MSR without
regard for other uses (perf) and doesn't provide the flexibility
needed for perf either.

Its users are ptrace-block-step and ptrace-bts, since ptrace-bts
was never used and ptrace-block-step can be implemented using a
much simpler approach.

So axe all 3000 lines of it. That includes the *locked_memory*()
APIs in mm/mlock.c as well.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Markus Metzger <markus.t.metzger@intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20100325135413.938004390@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.cpu               |   20 -
 arch/x86/Kconfig.debug             |    9 -
 arch/x86/include/asm/ds.h          |  302 --------
 arch/x86/include/asm/processor.h   |   33 +-
 arch/x86/include/asm/ptrace-abi.h  |   57 +-
 arch/x86/include/asm/ptrace.h      |    6 -
 arch/x86/include/asm/thread_info.h |    6 +-
 arch/x86/kernel/Makefile           |    2 -
 arch/x86/kernel/cpu/intel.c        |    2 -
 arch/x86/kernel/ds.c               | 1437 ------------------------------------
 arch/x86/kernel/ds_selftest.c      |  408 ----------
 arch/x86/kernel/ds_selftest.h      |   15 -
 arch/x86/kernel/dumpstack.c        |    5 -
 arch/x86/kernel/kprobes.c          |    6 +-
 arch/x86/kernel/process.c          |    9 -
 arch/x86/kernel/process_32.c       |    8 -
 arch/x86/kernel/process_64.c       |    8 -
 arch/x86/kernel/ptrace.c           |  382 ----------
 arch/x86/kernel/step.c             |   36 +-
 arch/x86/kernel/traps.c            |    5 -
 include/linux/ftrace.h             |   12 -
 include/linux/mm.h                 |    4 -
 include/linux/ptrace.h             |   12 -
 include/linux/sched.h              |    9 -
 kernel/fork.c                      |    3 -
 kernel/ptrace.c                    |    1 -
 kernel/sched.c                     |   43 --
 kernel/trace/Kconfig               |   11 -
 kernel/trace/Makefile              |    1 -
 kernel/trace/trace.h               |    4 -
 kernel/trace/trace_entries.h       |   12 -
 kernel/trace/trace_hw_branches.c   |  312 --------
 kernel/trace/trace_selftest.c      |   57 --
 mm/mlock.c                         |   41 -
 34 files changed, 9 insertions(+), 3269 deletions(-)
 delete mode 100644 arch/x86/include/asm/ds.h
 delete mode 100644 arch/x86/kernel/ds.c
 delete mode 100644 arch/x86/kernel/ds_selftest.c
 delete mode 100644 arch/x86/kernel/ds_selftest.h
 delete mode 100644 kernel/trace/trace_hw_branches.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a19829374e6a..918fbb1855cc 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -502,23 +502,3 @@ config CPU_SUP_UMC_32
 	  CPU might render the kernel unbootable.
 
 	  If unsure, say N.
-
-config X86_DS
-	def_bool X86_PTRACE_BTS
-	depends on X86_DEBUGCTLMSR
-	select HAVE_HW_BRANCH_TRACER
-
-config X86_PTRACE_BTS
-	bool "Branch Trace Store"
-	default y
-	depends on X86_DEBUGCTLMSR
-	depends on BROKEN
-	---help---
-	  This adds a ptrace interface to the hardware's branch trace store.
-
-	  Debuggers may use it to collect an execution trace of the debugged
-	  application in order to answer the question 'how did I get here?'.
-	  Debuggers may trace user mode as well as kernel mode.
-
-	  Say Y unless there is no application development on this machine
-	  and you want to save a small amount of code size.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bc01e3ebfeb2..bd58c8abbfbd 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -174,15 +174,6 @@ config IOMMU_LEAK
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
 
-config X86_DS_SELFTEST
-    bool "DS selftest"
-    default y
-    depends on DEBUG_KERNEL
-    depends on X86_DS
-	---help---
-	  Perform Debug Store selftests at boot time.
-	  If in doubt, say "N".
-
 config HAVE_MMIOTRACE_SUPPORT
 	def_bool y
 
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
deleted file mode 100644
index 70dac199b093..000000000000
--- a/arch/x86/include/asm/ds.h
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * Debug Store (DS) support
- *
- * This provides a low-level interface to the hardware's Debug Store
- * feature that is used for branch trace store (BTS) and
- * precise-event based sampling (PEBS).
- *
- * It manages:
- * - DS and BTS hardware configuration
- * - buffer overflow handling (to be done)
- * - buffer access
- *
- * It does not do:
- * - security checking (is the caller allowed to trace the task)
- * - buffer allocation (memory accounting)
- *
- *
- * Copyright (C) 2007-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
- */
-
-#ifndef _ASM_X86_DS_H
-#define _ASM_X86_DS_H
-
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/err.h>
-
-
-#ifdef CONFIG_X86_DS
-
-struct task_struct;
-struct ds_context;
-struct ds_tracer;
-struct bts_tracer;
-struct pebs_tracer;
-
-typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
-typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
-
-
-/*
- * A list of features plus corresponding macros to talk about them in
- * the ds_request function's flags parameter.
- *
- * We use the enum to index an array of corresponding control bits;
- * we use the macro to index a flags bit-vector.
- */
-enum ds_feature {
-	dsf_bts = 0,
-	dsf_bts_kernel,
-#define BTS_KERNEL (1 << dsf_bts_kernel)
-	/* trace kernel-mode branches */
-
-	dsf_bts_user,
-#define BTS_USER (1 << dsf_bts_user)
-	/* trace user-mode branches */
-
-	dsf_bts_overflow,
-	dsf_bts_max,
-	dsf_pebs = dsf_bts_max,
-
-	dsf_pebs_max,
-	dsf_ctl_max = dsf_pebs_max,
-	dsf_bts_timestamps = dsf_ctl_max,
-#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
-	/* add timestamps into BTS trace */
-
-#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
-};
-
-
-/*
- * Request BTS or PEBS
- *
- * Due to alignement constraints, the actual buffer may be slightly
- * smaller than the requested or provided buffer.
- *
- * Returns a pointer to a tracer structure on success, or
- * ERR_PTR(errcode) on failure.
- *
- * The interrupt threshold is independent from the overflow callback
- * to allow users to use their own overflow interrupt handling mechanism.
- *
- * The function might sleep.
- *
- * task: the task to request recording for
- * cpu:  the cpu to request recording for
- * base: the base pointer for the (non-pageable) buffer;
- * size: the size of the provided buffer in bytes
- * ovfl: pointer to a function to be called on buffer overflow;
- *       NULL if cyclic buffer requested
- * th: the interrupt threshold in records from the end of the buffer;
- *     -1 if no interrupt threshold is requested.
- * flags: a bit-mask of the above flags
- */
-extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
-					      void *base, size_t size,
-					      bts_ovfl_callback_t ovfl,
-					      size_t th, unsigned int flags);
-extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
-					     bts_ovfl_callback_t ovfl,
-					     size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
-						void *base, size_t size,
-						pebs_ovfl_callback_t ovfl,
-						size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
-					       void *base, size_t size,
-					       pebs_ovfl_callback_t ovfl,
-					       size_t th, unsigned int flags);
-
-/*
- * Release BTS or PEBS resources
- * Suspend and resume BTS or PEBS tracing
- *
- * Must be called with irq's enabled.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern void ds_release_bts(struct bts_tracer *tracer);
-extern void ds_suspend_bts(struct bts_tracer *tracer);
-extern void ds_resume_bts(struct bts_tracer *tracer);
-extern void ds_release_pebs(struct pebs_tracer *tracer);
-extern void ds_suspend_pebs(struct pebs_tracer *tracer);
-extern void ds_resume_pebs(struct pebs_tracer *tracer);
-
-/*
- * Release BTS or PEBS resources
- * Suspend and resume BTS or PEBS tracing
- *
- * Cpu tracers must call this on the traced cpu.
- * Task tracers must call ds_release_~_noirq() for themselves.
- *
- * May be called with irq's disabled.
- *
- * Returns 0 if successful;
- * -EPERM if the cpu tracer does not trace the current cpu.
- * -EPERM if the task tracer does not trace itself.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern int ds_release_bts_noirq(struct bts_tracer *tracer);
-extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
-extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
-extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
-extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
-extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
-
-
-/*
- * The raw DS buffer state as it is used for BTS and PEBS recording.
- *
- * This is the low-level, arch-dependent interface for working
- * directly on the raw trace data.
- */
-struct ds_trace {
-	/* the number of bts/pebs records */
-	size_t n;
-	/* the size of a bts/pebs record in bytes */
-	size_t size;
-	/* pointers into the raw buffer:
-	   - to the first entry */
-	void *begin;
-	/* - one beyond the last entry */
-	void *end;
-	/* - one beyond the newest entry */
-	void *top;
-	/* - the interrupt threshold */
-	void *ith;
-	/* flags given on ds_request() */
-	unsigned int flags;
-};
-
-/*
- * An arch-independent view on branch trace data.
- */
-enum bts_qualifier {
-	bts_invalid,
-#define BTS_INVALID bts_invalid
-
-	bts_branch,
-#define BTS_BRANCH bts_branch
-
-	bts_task_arrives,
-#define BTS_TASK_ARRIVES bts_task_arrives
-
-	bts_task_departs,
-#define BTS_TASK_DEPARTS bts_task_departs
-
-	bts_qual_bit_size = 4,
-	bts_qual_max = (1 << bts_qual_bit_size),
-};
-
-struct bts_struct {
-	__u64 qualifier;
-	union {
-		/* BTS_BRANCH */
-		struct {
-			__u64 from;
-			__u64 to;
-		} lbr;
-		/* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
-		struct {
-			__u64 clock;
-			pid_t pid;
-		} event;
-	} variant;
-};
-
-
-/*
- * The BTS state.
- *
- * This gives access to the raw DS state and adds functions to provide
- * an arch-independent view of the BTS data.
- */
-struct bts_trace {
-	struct ds_trace ds;
-
-	int (*read)(struct bts_tracer *tracer, const void *at,
-		    struct bts_struct *out);
-	int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
-};
-
-
-/*
- * The PEBS state.
- *
- * This gives access to the raw DS state and the PEBS-specific counter
- * reset value.
- */
-struct pebs_trace {
-	struct ds_trace ds;
-
-	/* the number of valid counters in the below array */
-	unsigned int counters;
-
-#define MAX_PEBS_COUNTERS 4
-	/* the counter reset value */
-	unsigned long long counter_reset[MAX_PEBS_COUNTERS];
-};
-
-
-/*
- * Read the BTS or PEBS trace.
- *
- * Returns a view on the trace collected for the parameter tracer.
- *
- * The view remains valid as long as the traced task is not running or
- * the tracer is suspended.
- * Writes into the trace buffer are not reflected.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
-extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
-
-
-/*
- * Reset the write pointer of the BTS/PEBS buffer.
- *
- * Returns 0 on success; -Eerrno on error
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern int ds_reset_bts(struct bts_tracer *tracer);
-extern int ds_reset_pebs(struct pebs_tracer *tracer);
-
-/*
- * Set the PEBS counter reset value.
- *
- * Returns 0 on success; -Eerrno on error
- *
- * tracer: the tracer handle returned from ds_request_pebs()
- * counter: the index of the counter
- * value: the new counter reset value
- */
-extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
-			     unsigned int counter, u64 value);
-
-/*
- * Initialization
- */
-struct cpuinfo_x86;
-extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
-
-/*
- * Context switch work
- */
-extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
-
-#else /* CONFIG_X86_DS */
-
-struct cpuinfo_x86;
-static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
-static inline void ds_switch_to(struct task_struct *prev,
-				struct task_struct *next) {}
-
-#endif /* CONFIG_X86_DS */
-#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b753ea59703a..5bec21a66dc5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -21,7 +21,6 @@ struct mm_struct;
 #include <asm/msr.h>
 #include <asm/desc_defs.h>
 #include <asm/nops.h>
-#include <asm/ds.h>
 
 #include <linux/personality.h>
 #include <linux/cpumask.h>
@@ -29,6 +28,7 @@ struct mm_struct;
 #include <linux/threads.h>
 #include <linux/math64.h>
 #include <linux/init.h>
+#include <linux/err.h>
 
 #define HBP_NUM 4
 /*
@@ -473,10 +473,6 @@ struct thread_struct {
 	unsigned long		iopl;
 	/* Max allowed port in the bitmap, in bytes: */
 	unsigned		io_bitmap_max;
-/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set.  */
-	unsigned long	debugctlmsr;
-	/* Debug Store context; see asm/ds.h */
-	struct ds_context	*ds_ctx;
 };
 
 static inline unsigned long native_get_debugreg(int regno)
@@ -814,21 +810,6 @@ static inline unsigned long get_debugctlmsr(void)
     return debugctlmsr;
 }
 
-static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
-{
-	u64 debugctlmsr = 0;
-	u32 val1, val2;
-
-#ifndef CONFIG_X86_DEBUGCTLMSR
-	if (boot_cpu_data.x86 < 6)
-		return 0;
-#endif
-	rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
-	debugctlmsr = val1 | ((u64)val2 << 32);
-
-	return debugctlmsr;
-}
-
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
 {
 #ifndef CONFIG_X86_DEBUGCTLMSR
@@ -838,18 +819,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
 }
 
-static inline void update_debugctlmsr_on_cpu(int cpu,
-					     unsigned long debugctlmsr)
-{
-#ifndef CONFIG_X86_DEBUGCTLMSR
-	if (boot_cpu_data.x86 < 6)
-		return;
-#endif
-	wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
-		     (u32)((u64)debugctlmsr),
-		     (u32)((u64)debugctlmsr >> 32));
-}
-
 /*
  * from system description table in BIOS. Mostly for MCA use, but
  * others may find it useful:
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 86723035a515..52b098a6eebb 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -82,61 +82,6 @@
 
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
-
-/* configuration/status structure used in PTRACE_BTS_CONFIG and
-   PTRACE_BTS_STATUS commands.
-*/
-struct ptrace_bts_config {
-	/* requested or actual size of BTS buffer in bytes */
-	__u32 size;
-	/* bitmask of below flags */
-	__u32 flags;
-	/* buffer overflow signal */
-	__u32 signal;
-	/* actual size of bts_struct in bytes */
-	__u32 bts_size;
-};
-#endif /* __ASSEMBLY__ */
-
-#define PTRACE_BTS_O_TRACE	0x1 /* branch trace */
-#define PTRACE_BTS_O_SCHED	0x2 /* scheduling events w/ jiffies */
-#define PTRACE_BTS_O_SIGNAL     0x4 /* send SIG<signal> on buffer overflow
-				       instead of wrapping around */
-#define PTRACE_BTS_O_ALLOC	0x8 /* (re)allocate buffer */
-
-#define PTRACE_BTS_CONFIG	40
-/* Configure branch trace recording.
-   ADDR points to a struct ptrace_bts_config.
-   DATA gives the size of that buffer.
-   A new buffer is allocated, if requested in the flags.
-   An overflow signal may only be requested for new buffers.
-   Returns the number of bytes read.
-*/
-#define PTRACE_BTS_STATUS	41
-/* Return the current configuration in a struct ptrace_bts_config
-   pointed to by ADDR; DATA gives the size of that buffer.
-   Returns the number of bytes written.
-*/
-#define PTRACE_BTS_SIZE		42
-/* Return the number of available BTS records for draining.
-   DATA and ADDR are ignored.
-*/
-#define PTRACE_BTS_GET		43
-/* Get a single BTS record.
-   DATA defines the index into the BTS array, where 0 is the newest
-   entry, and higher indices refer to older entries.
-   ADDR is pointing to struct bts_struct (see asm/ds.h).
-*/
-#define PTRACE_BTS_CLEAR	44
-/* Clear the BTS buffer.
-   DATA and ADDR are ignored.
-*/
-#define PTRACE_BTS_DRAIN	45
-/* Read all available BTS records and clear the buffer.
-   ADDR points to an array of struct bts_struct.
-   DATA gives the size of that buffer.
-   BTS records are read from oldest to newest.
-   Returns number of BTS records drained.
-*/
+#endif
 
 #endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 69a686a7dff0..78cd1ea94500 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -289,12 +289,6 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info, int can_allocate);
 
-#ifdef CONFIG_X86_PTRACE_BTS
-extern void ptrace_bts_untrace(struct task_struct *tsk);
-
-#define arch_ptrace_untrace(tsk)	ptrace_bts_untrace(tsk)
-#endif /* CONFIG_X86_PTRACE_BTS */
-
 #endif /* __KERNEL__ */
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e0d28901e969..dc85e12d1405 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -92,8 +92,6 @@ struct thread_info {
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
 #define TIF_FREEZE		23	/* is freezing for suspend */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
-#define TIF_DEBUGCTLMSR		25	/* uses thread_struct.debugctlmsr */
-#define TIF_DS_AREA_MSR		26      /* uses thread_struct.ds_area_msr */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 
@@ -115,8 +113,6 @@ struct thread_info {
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
-#define _TIF_DEBUGCTLMSR	(1 << TIF_DEBUGCTLMSR)
-#define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 
@@ -147,7 +143,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
+	(_TIF_IO_BITMAP|_TIF_NOTSC)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4c58352209e0..e77b22083721 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-y				+= process.o
 obj-y				+= i387.o xsave.o
 obj-y				+= ptrace.o
-obj-$(CONFIG_X86_DS)		+= ds.o
-obj-$(CONFIG_X86_DS_SELFTEST)		+= ds_selftest.o
 obj-$(CONFIG_X86_32)		+= tls.o
 obj-$(CONFIG_IA32_EMULATION)	+= tls.o
 obj-y				+= step.o
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7e1cca13af35..d72377c41c76 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -12,7 +12,6 @@
 #include <asm/processor.h>
 #include <asm/pgtable.h>
 #include <asm/msr.h>
-#include <asm/ds.h>
 #include <asm/bugs.h>
 #include <asm/cpu.h>
 
@@ -367,7 +366,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 			set_cpu_cap(c, X86_FEATURE_BTS);
 		if (!(l1 & (1<<12)))
 			set_cpu_cap(c, X86_FEATURE_PEBS);
-		ds_init_intel(c);
 	}
 
 	if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
deleted file mode 100644
index 1c47390dd0e5..000000000000
--- a/arch/x86/kernel/ds.c
+++ /dev/null
@@ -1,1437 +0,0 @@
-/*
- * Debug Store support
- *
- * This provides a low-level interface to the hardware's Debug Store
- * feature that is used for branch trace store (BTS) and
- * precise-event based sampling (PEBS).
- *
- * It manages:
- * - DS and BTS hardware configuration
- * - buffer overflow handling (to be done)
- * - buffer access
- *
- * It does not do:
- * - security checking (is the caller allowed to trace the task)
- * - buffer allocation (memory accounting)
- *
- *
- * Copyright (C) 2007-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/trace_clock.h>
-
-#include <asm/ds.h>
-
-#include "ds_selftest.h"
-
-/*
- * The configuration for a particular DS hardware implementation:
- */
-struct ds_configuration {
-	/* The name of the configuration: */
-	const char		*name;
-
-	/* The size of pointer-typed fields in DS, BTS, and PEBS: */
-	unsigned char		sizeof_ptr_field;
-
-	/* The size of a BTS/PEBS record in bytes: */
-	unsigned char		sizeof_rec[2];
-
-	/* The number of pebs counter reset values in the DS structure. */
-	unsigned char		nr_counter_reset;
-
-	/* Control bit-masks indexed by enum ds_feature: */
-	unsigned long		ctl[dsf_ctl_max];
-};
-static struct ds_configuration ds_cfg __read_mostly;
-
-
-/* Maximal size of a DS configuration: */
-#define MAX_SIZEOF_DS		0x80
-
-/* Maximal size of a BTS record: */
-#define MAX_SIZEOF_BTS		(3 * 8)
-
-/* BTS and PEBS buffer alignment: */
-#define DS_ALIGNMENT		(1 << 3)
-
-/* Number of buffer pointers in DS: */
-#define NUM_DS_PTR_FIELDS	8
-
-/* Size of a pebs reset value in DS: */
-#define PEBS_RESET_FIELD_SIZE	8
-
-/* Mask of control bits in the DS MSR register: */
-#define BTS_CONTROL				  \
-	( ds_cfg.ctl[dsf_bts]			| \
-	  ds_cfg.ctl[dsf_bts_kernel]		| \
-	  ds_cfg.ctl[dsf_bts_user]		| \
-	  ds_cfg.ctl[dsf_bts_overflow] )
-
-/*
- * A BTS or PEBS tracer.
- *
- * This holds the configuration of the tracer and serves as a handle
- * to identify tracers.
- */
-struct ds_tracer {
-	/* The DS context (partially) owned by this tracer. */
-	struct ds_context	*context;
-	/* The buffer provided on ds_request() and its size in bytes. */
-	void			*buffer;
-	size_t			size;
-};
-
-struct bts_tracer {
-	/* The common DS part: */
-	struct ds_tracer	ds;
-
-	/* The trace including the DS configuration: */
-	struct bts_trace	trace;
-
-	/* Buffer overflow notification function: */
-	bts_ovfl_callback_t	ovfl;
-
-	/* Active flags affecting trace collection. */
-	unsigned int		flags;
-};
-
-struct pebs_tracer {
-	/* The common DS part: */
-	struct ds_tracer	ds;
-
-	/* The trace including the DS configuration: */
-	struct pebs_trace	trace;
-
-	/* Buffer overflow notification function: */
-	pebs_ovfl_callback_t	ovfl;
-};
-
-/*
- * Debug Store (DS) save area configuration (see Intel64 and IA32
- * Architectures Software Developer's Manual, section 18.5)
- *
- * The DS configuration consists of the following fields; different
- * architetures vary in the size of those fields.
- *
- * - double-word aligned base linear address of the BTS buffer
- * - write pointer into the BTS buffer
- * - end linear address of the BTS buffer (one byte beyond the end of
- *   the buffer)
- * - interrupt pointer into BTS buffer
- *   (interrupt occurs when write pointer passes interrupt pointer)
- * - double-word aligned base linear address of the PEBS buffer
- * - write pointer into the PEBS buffer
- * - end linear address of the PEBS buffer (one byte beyond the end of
- *   the buffer)
- * - interrupt pointer into PEBS buffer
- *   (interrupt occurs when write pointer passes interrupt pointer)
- * - value to which counter is reset following counter overflow
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- *
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- * - an offset giving the start of the respective region
- *
- * This offset is further used to index various arrays holding
- * information for BTS and PEBS at the respective index.
- *
- * On later 32bit processors, we only access the lower 32bit of the
- * 64bit pointer fields. The upper halves will be zeroed out.
- */
-
-enum ds_field {
-	ds_buffer_base = 0,
-	ds_index,
-	ds_absolute_maximum,
-	ds_interrupt_threshold,
-};
-
-enum ds_qualifier {
-	ds_bts = 0,
-	ds_pebs
-};
-
-static inline unsigned long
-ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
-{
-	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
-	return *(unsigned long *)base;
-}
-
-static inline void
-ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
-       unsigned long value)
-{
-	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
-	(*(unsigned long *)base) = value;
-}
-
-
-/*
- * Locking is done only for allocating BTS or PEBS resources.
- */
-static DEFINE_SPINLOCK(ds_lock);
-
-/*
- * We either support (system-wide) per-cpu or per-thread allocation.
- * We distinguish the two based on the task_struct pointer, where a
- * NULL pointer indicates per-cpu allocation for the current cpu.
- *
- * Allocations are use-counted. As soon as resources are allocated,
- * further allocations must be of the same type (per-cpu or
- * per-thread). We model this by counting allocations (i.e. the number
- * of tracers of a certain type) for one type negatively:
- *   =0  no tracers
- *   >0  number of per-thread tracers
- *   <0  number of per-cpu tracers
- *
- * Tracers essentially gives the number of ds contexts for a certain
- * type of allocation.
- */
-static atomic_t tracers = ATOMIC_INIT(0);
-
-static inline int get_tracer(struct task_struct *task)
-{
-	int error;
-
-	spin_lock_irq(&ds_lock);
-
-	if (task) {
-		error = -EPERM;
-		if (atomic_read(&tracers) < 0)
-			goto out;
-		atomic_inc(&tracers);
-	} else {
-		error = -EPERM;
-		if (atomic_read(&tracers) > 0)
-			goto out;
-		atomic_dec(&tracers);
-	}
-
-	error = 0;
-out:
-	spin_unlock_irq(&ds_lock);
-	return error;
-}
-
-static inline void put_tracer(struct task_struct *task)
-{
-	if (task)
-		atomic_dec(&tracers);
-	else
-		atomic_inc(&tracers);
-}
-
-/*
- * The DS context is either attached to a thread or to a cpu:
- * - in the former case, the thread_struct contains a pointer to the
- *   attached context.
- * - in the latter case, we use a static array of per-cpu context
- *   pointers.
- *
- * Contexts are use-counted. They are allocated on first access and
- * deallocated when the last user puts the context.
- */
-struct ds_context {
-	/* The DS configuration; goes into MSR_IA32_DS_AREA: */
-	unsigned char		ds[MAX_SIZEOF_DS];
-
-	/* The owner of the BTS and PEBS configuration, respectively: */
-	struct bts_tracer	*bts_master;
-	struct pebs_tracer	*pebs_master;
-
-	/* Use count: */
-	unsigned long		count;
-
-	/* Pointer to the context pointer field: */
-	struct ds_context	**this;
-
-	/* The traced task; NULL for cpu tracing: */
-	struct task_struct	*task;
-
-	/* The traced cpu; only valid if task is NULL: */
-	int			cpu;
-};
-
-static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
-
-
-static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
-{
-	struct ds_context **p_context =
-		(task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
-	struct ds_context *context = NULL;
-	struct ds_context *new_context = NULL;
-
-	/* Chances are small that we already have a context. */
-	new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
-	if (!new_context)
-		return NULL;
-
-	spin_lock_irq(&ds_lock);
-
-	context = *p_context;
-	if (likely(!context)) {
-		context = new_context;
-
-		context->this = p_context;
-		context->task = task;
-		context->cpu = cpu;
-		context->count = 0;
-
-		*p_context = context;
-	}
-
-	context->count++;
-
-	spin_unlock_irq(&ds_lock);
-
-	if (context != new_context)
-		kfree(new_context);
-
-	return context;
-}
-
-static void ds_put_context(struct ds_context *context)
-{
-	struct task_struct *task;
-	unsigned long irq;
-
-	if (!context)
-		return;
-
-	spin_lock_irqsave(&ds_lock, irq);
-
-	if (--context->count) {
-		spin_unlock_irqrestore(&ds_lock, irq);
-		return;
-	}
-
-	*(context->this) = NULL;
-
-	task = context->task;
-
-	if (task)
-		clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
-	/*
-	 * We leave the (now dangling) pointer to the DS configuration in
-	 * the DS_AREA msr. This is as good or as bad as replacing it with
-	 * NULL - the hardware would crash if we enabled tracing.
-	 *
-	 * This saves us some problems with having to write an msr on a
-	 * different cpu while preventing others from doing the same for the
-	 * next context for that same cpu.
-	 */
-
-	spin_unlock_irqrestore(&ds_lock, irq);
-
-	/* The context might still be in use for context switching. */
-	if (task && (task != current))
-		wait_task_context_switch(task);
-
-	kfree(context);
-}
-
-static void ds_install_ds_area(struct ds_context *context)
-{
-	unsigned long ds;
-
-	ds = (unsigned long)context->ds;
-
-	/*
-	 * There is a race between the bts master and the pebs master.
-	 *
-	 * The thread/cpu access is synchronized via get/put_cpu() for
-	 * task tracing and via wrmsr_on_cpu for cpu tracing.
-	 *
-	 * If bts and pebs are collected for the same task or same cpu,
-	 * the same confiuration is written twice.
-	 */
-	if (context->task) {
-		get_cpu();
-		if (context->task == current)
-			wrmsrl(MSR_IA32_DS_AREA, ds);
-		set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
-		put_cpu();
-	} else
-		wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
-			     (u32)((u64)ds), (u32)((u64)ds >> 32));
-}
-
-/*
- * Call the tracer's callback on a buffer overflow.
- *
- * context: the ds context
- * qual: the buffer type
- */
-static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
-{
-	switch (qual) {
-	case ds_bts:
-		if (context->bts_master &&
-		    context->bts_master->ovfl)
-			context->bts_master->ovfl(context->bts_master);
-		break;
-	case ds_pebs:
-		if (context->pebs_master &&
-		    context->pebs_master->ovfl)
-			context->pebs_master->ovfl(context->pebs_master);
-		break;
-	}
-}
-
-
-/*
- * Write raw data into the BTS or PEBS buffer.
- *
- * The remainder of any partially written record is zeroed out.
- *
- * context: the DS context
- * qual:    the buffer type
- * record:  the data to write
- * size:    the size of the data
- */
-static int ds_write(struct ds_context *context, enum ds_qualifier qual,
-		    const void *record, size_t size)
-{
-	int bytes_written = 0;
-
-	if (!record)
-		return -EINVAL;
-
-	while (size) {
-		unsigned long base, index, end, write_end, int_th;
-		unsigned long write_size, adj_write_size;
-
-		/*
-		 * Write as much as possible without producing an
-		 * overflow interrupt.
-		 *
-		 * Interrupt_threshold must either be
-		 * - bigger than absolute_maximum or
-		 * - point to a record between buffer_base and absolute_maximum
-		 *
-		 * Index points to a valid record.
-		 */
-		base   = ds_get(context->ds, qual, ds_buffer_base);
-		index  = ds_get(context->ds, qual, ds_index);
-		end    = ds_get(context->ds, qual, ds_absolute_maximum);
-		int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
-
-		write_end = min(end, int_th);
-
-		/*
-		 * If we are already beyond the interrupt threshold,
-		 * we fill the entire buffer.
-		 */
-		if (write_end <= index)
-			write_end = end;
-
-		if (write_end <= index)
-			break;
-
-		write_size = min((unsigned long) size, write_end - index);
-		memcpy((void *)index, record, write_size);
-
-		record = (const char *)record + write_size;
-		size -= write_size;
-		bytes_written += write_size;
-
-		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
-		adj_write_size *= ds_cfg.sizeof_rec[qual];
-
-		/* Zero out trailing bytes. */
-		memset((char *)index + write_size, 0,
-		       adj_write_size - write_size);
-		index += adj_write_size;
-
-		if (index >= end)
-			index = base;
-		ds_set(context->ds, qual, ds_index, index);
-
-		if (index >= int_th)
-			ds_overflow(context, qual);
-	}
-
-	return bytes_written;
-}
-
-
-/*
- * Branch Trace Store (BTS) uses the following format. Different
- * architectures vary in the size of those fields.
- * - source linear address
- * - destination linear address
- * - flags
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- * We compute the base address for the fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- *
- * In order to store additional information in the BTS buffer, we use
- * a special source address to indicate that the record requires
- * special interpretation.
- *
- * Netburst indicated via a bit in the flags field whether the branch
- * was predicted; this is ignored.
- *
- * We use two levels of abstraction:
- * - the raw data level defined here
- * - an arch-independent level defined in ds.h
- */
-
-enum bts_field {
-	bts_from,
-	bts_to,
-	bts_flags,
-
-	bts_qual		= bts_from,
-	bts_clock		= bts_to,
-	bts_pid			= bts_flags,
-
-	bts_qual_mask		= (bts_qual_max - 1),
-	bts_escape		= ((unsigned long)-1 & ~bts_qual_mask)
-};
-
-static inline unsigned long bts_get(const char *base, unsigned long field)
-{
-	base += (ds_cfg.sizeof_ptr_field * field);
-	return *(unsigned long *)base;
-}
-
-static inline void bts_set(char *base, unsigned long field, unsigned long val)
-{
-	base += (ds_cfg.sizeof_ptr_field * field);
-	(*(unsigned long *)base) = val;
-}
-
-
-/*
- * The raw BTS data is architecture dependent.
- *
- * For higher-level users, we give an arch-independent view.
- * - ds.h defines struct bts_struct
- * - bts_read translates one raw bts record into a bts_struct
- * - bts_write translates one bts_struct into the raw format and
- *   writes it into the top of the parameter tracer's buffer.
- *
- * return: bytes read/written on success; -Eerrno, otherwise
- */
-static int
-bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
-{
-	if (!tracer)
-		return -EINVAL;
-
-	if (at < tracer->trace.ds.begin)
-		return -EINVAL;
-
-	if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
-		return -EINVAL;
-
-	memset(out, 0, sizeof(*out));
-	if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
-		out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
-		out->variant.event.clock = bts_get(at, bts_clock);
-		out->variant.event.pid = bts_get(at, bts_pid);
-	} else {
-		out->qualifier = bts_branch;
-		out->variant.lbr.from = bts_get(at, bts_from);
-		out->variant.lbr.to   = bts_get(at, bts_to);
-
-		if (!out->variant.lbr.from && !out->variant.lbr.to)
-			out->qualifier = bts_invalid;
-	}
-
-	return ds_cfg.sizeof_rec[ds_bts];
-}
-
-static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
-{
-	unsigned char raw[MAX_SIZEOF_BTS];
-
-	if (!tracer)
-		return -EINVAL;
-
-	if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
-		return -EOVERFLOW;
-
-	switch (in->qualifier) {
-	case bts_invalid:
-		bts_set(raw, bts_from, 0);
-		bts_set(raw, bts_to, 0);
-		bts_set(raw, bts_flags, 0);
-		break;
-	case bts_branch:
-		bts_set(raw, bts_from, in->variant.lbr.from);
-		bts_set(raw, bts_to,   in->variant.lbr.to);
-		bts_set(raw, bts_flags, 0);
-		break;
-	case bts_task_arrives:
-	case bts_task_departs:
-		bts_set(raw, bts_qual, (bts_escape | in->qualifier));
-		bts_set(raw, bts_clock, in->variant.event.clock);
-		bts_set(raw, bts_pid, in->variant.event.pid);
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return ds_write(tracer->ds.context, ds_bts, raw,
-			ds_cfg.sizeof_rec[ds_bts]);
-}
-
-
-static void ds_write_config(struct ds_context *context,
-			    struct ds_trace *cfg, enum ds_qualifier qual)
-{
-	unsigned char *ds = context->ds;
-
-	ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
-	ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
-	ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
-	ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
-}
-
-static void ds_read_config(struct ds_context *context,
-			   struct ds_trace *cfg, enum ds_qualifier qual)
-{
-	unsigned char *ds = context->ds;
-
-	cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
-	cfg->top = (void *)ds_get(ds, qual, ds_index);
-	cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
-	cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
-}
-
-static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
-			     void *base, size_t size, size_t ith,
-			     unsigned int flags) {
-	unsigned long buffer, adj;
-
-	/*
-	 * Adjust the buffer address and size to meet alignment
-	 * constraints:
-	 * - buffer is double-word aligned
-	 * - size is multiple of record size
-	 *
-	 * We checked the size at the very beginning; we have enough
-	 * space to do the adjustment.
-	 */
-	buffer = (unsigned long)base;
-
-	adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
-	buffer += adj;
-	size   -= adj;
-
-	trace->n = size / ds_cfg.sizeof_rec[qual];
-	trace->size = ds_cfg.sizeof_rec[qual];
-
-	size = (trace->n * trace->size);
-
-	trace->begin = (void *)buffer;
-	trace->top = trace->begin;
-	trace->end = (void *)(buffer + size);
-	/*
-	 * The value for 'no threshold' is -1, which will set the
-	 * threshold outside of the buffer, just like we want it.
-	 */
-	ith *= ds_cfg.sizeof_rec[qual];
-	trace->ith = (void *)(buffer + size - ith);
-
-	trace->flags = flags;
-}
-
-
-static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
-		      enum ds_qualifier qual, struct task_struct *task,
-		      int cpu, void *base, size_t size, size_t th)
-{
-	struct ds_context *context;
-	int error;
-	size_t req_size;
-
-	error = -EOPNOTSUPP;
-	if (!ds_cfg.sizeof_rec[qual])
-		goto out;
-
-	error = -EINVAL;
-	if (!base)
-		goto out;
-
-	req_size = ds_cfg.sizeof_rec[qual];
-	/* We might need space for alignment adjustments. */
-	if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
-		req_size += DS_ALIGNMENT;
-
-	error = -EINVAL;
-	if (size < req_size)
-		goto out;
-
-	if (th != (size_t)-1) {
-		th *= ds_cfg.sizeof_rec[qual];
-
-		error = -EINVAL;
-		if (size <= th)
-			goto out;
-	}
-
-	tracer->buffer = base;
-	tracer->size = size;
-
-	error = -ENOMEM;
-	context = ds_get_context(task, cpu);
-	if (!context)
-		goto out;
-	tracer->context = context;
-
-	/*
-	 * Defer any tracer-specific initialization work for the context until
-	 * context ownership has been clarified.
-	 */
-
-	error = 0;
- out:
-	return error;
-}
-
-static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
-					 void *base, size_t size,
-					 bts_ovfl_callback_t ovfl, size_t th,
-					 unsigned int flags)
-{
-	struct bts_tracer *tracer;
-	int error;
-
-	/* Buffer overflow notification is not yet implemented. */
-	error = -EOPNOTSUPP;
-	if (ovfl)
-		goto out;
-
-	error = get_tracer(task);
-	if (error < 0)
-		goto out;
-
-	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
-	if (!tracer)
-		goto out_put_tracer;
-	tracer->ovfl = ovfl;
-
-	/* Do some more error checking and acquire a tracing context. */
-	error = ds_request(&tracer->ds, &tracer->trace.ds,
-			   ds_bts, task, cpu, base, size, th);
-	if (error < 0)
-		goto out_tracer;
-
-	/* Claim the bts part of the tracing context we acquired above. */
-	spin_lock_irq(&ds_lock);
-
-	error = -EPERM;
-	if (tracer->ds.context->bts_master)
-		goto out_unlock;
-	tracer->ds.context->bts_master = tracer;
-
-	spin_unlock_irq(&ds_lock);
-
-	/*
-	 * Now that we own the bts part of the context, let's complete the
-	 * initialization for that part.
-	 */
-	ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
-	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
-	ds_install_ds_area(tracer->ds.context);
-
-	tracer->trace.read  = bts_read;
-	tracer->trace.write = bts_write;
-
-	/* Start tracing. */
-	ds_resume_bts(tracer);
-
-	return tracer;
-
- out_unlock:
-	spin_unlock_irq(&ds_lock);
-	ds_put_context(tracer->ds.context);
- out_tracer:
-	kfree(tracer);
- out_put_tracer:
-	put_tracer(task);
- out:
-	return ERR_PTR(error);
-}
-
-struct bts_tracer *ds_request_bts_task(struct task_struct *task,
-				       void *base, size_t size,
-				       bts_ovfl_callback_t ovfl,
-				       size_t th, unsigned int flags)
-{
-	return ds_request_bts(task, 0, base, size, ovfl, th, flags);
-}
-
-struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
-				      bts_ovfl_callback_t ovfl,
-				      size_t th, unsigned int flags)
-{
-	return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
-}
-
-static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
-					   void *base, size_t size,
-					   pebs_ovfl_callback_t ovfl, size_t th,
-					   unsigned int flags)
-{
-	struct pebs_tracer *tracer;
-	int error;
-
-	/* Buffer overflow notification is not yet implemented. */
-	error = -EOPNOTSUPP;
-	if (ovfl)
-		goto out;
-
-	error = get_tracer(task);
-	if (error < 0)
-		goto out;
-
-	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
-	if (!tracer)
-		goto out_put_tracer;
-	tracer->ovfl = ovfl;
-
-	/* Do some more error checking and acquire a tracing context. */
-	error = ds_request(&tracer->ds, &tracer->trace.ds,
-			   ds_pebs, task, cpu, base, size, th);
-	if (error < 0)
-		goto out_tracer;
-
-	/* Claim the pebs part of the tracing context we acquired above. */
-	spin_lock_irq(&ds_lock);
-
-	error = -EPERM;
-	if (tracer->ds.context->pebs_master)
-		goto out_unlock;
-	tracer->ds.context->pebs_master = tracer;
-
-	spin_unlock_irq(&ds_lock);
-
-	/*
-	 * Now that we own the pebs part of the context, let's complete the
-	 * initialization for that part.
-	 */
-	ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
-	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-	ds_install_ds_area(tracer->ds.context);
-
-	/* Start tracing. */
-	ds_resume_pebs(tracer);
-
-	return tracer;
-
- out_unlock:
-	spin_unlock_irq(&ds_lock);
-	ds_put_context(tracer->ds.context);
- out_tracer:
-	kfree(tracer);
- out_put_tracer:
-	put_tracer(task);
- out:
-	return ERR_PTR(error);
-}
-
-struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
-					 void *base, size_t size,
-					 pebs_ovfl_callback_t ovfl,
-					 size_t th, unsigned int flags)
-{
-	return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
-}
-
-struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
-					pebs_ovfl_callback_t ovfl,
-					size_t th, unsigned int flags)
-{
-	return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
-}
-
-static void ds_free_bts(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-
-	task = tracer->ds.context->task;
-
-	WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
-	tracer->ds.context->bts_master = NULL;
-
-	/* Make sure tracing stopped and the tracer is not in use. */
-	if (task && (task != current))
-		wait_task_context_switch(task);
-
-	ds_put_context(tracer->ds.context);
-	put_tracer(task);
-
-	kfree(tracer);
-}
-
-void ds_release_bts(struct bts_tracer *tracer)
-{
-	might_sleep();
-
-	if (!tracer)
-		return;
-
-	ds_suspend_bts(tracer);
-	ds_free_bts(tracer);
-}
-
-int ds_release_bts_noirq(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long irq;
-	int error;
-
-	if (!tracer)
-		return 0;
-
-	task = tracer->ds.context->task;
-
-	local_irq_save(irq);
-
-	error = -EPERM;
-	if (!task &&
-	    (tracer->ds.context->cpu != smp_processor_id()))
-		goto out;
-
-	error = -EPERM;
-	if (task && (task != current))
-		goto out;
-
-	ds_suspend_bts_noirq(tracer);
-	ds_free_bts(tracer);
-
-	error = 0;
- out:
-	local_irq_restore(irq);
-	return error;
-}
-
-static void update_task_debugctlmsr(struct task_struct *task,
-				    unsigned long debugctlmsr)
-{
-	task->thread.debugctlmsr = debugctlmsr;
-
-	get_cpu();
-	if (task == current)
-		update_debugctlmsr(debugctlmsr);
-	put_cpu();
-}
-
-void ds_suspend_bts(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long debugctlmsr;
-	int cpu;
-
-	if (!tracer)
-		return;
-
-	tracer->flags = 0;
-
-	task = tracer->ds.context->task;
-	cpu  = tracer->ds.context->cpu;
-
-	WARN_ON(!task && irqs_disabled());
-
-	debugctlmsr = (task ?
-		       task->thread.debugctlmsr :
-		       get_debugctlmsr_on_cpu(cpu));
-	debugctlmsr &= ~BTS_CONTROL;
-
-	if (task)
-		update_task_debugctlmsr(task, debugctlmsr);
-	else
-		update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-
-int ds_suspend_bts_noirq(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long debugctlmsr, irq;
-	int cpu, error = 0;
-
-	if (!tracer)
-		return 0;
-
-	tracer->flags = 0;
-
-	task = tracer->ds.context->task;
-	cpu  = tracer->ds.context->cpu;
-
-	local_irq_save(irq);
-
-	error = -EPERM;
-	if (!task && (cpu != smp_processor_id()))
-		goto out;
-
-	debugctlmsr = (task ?
-		       task->thread.debugctlmsr :
-		       get_debugctlmsr());
-	debugctlmsr &= ~BTS_CONTROL;
-
-	if (task)
-		update_task_debugctlmsr(task, debugctlmsr);
-	else
-		update_debugctlmsr(debugctlmsr);
-
-	error = 0;
- out:
-	local_irq_restore(irq);
-	return error;
-}
-
-static unsigned long ds_bts_control(struct bts_tracer *tracer)
-{
-	unsigned long control;
-
-	control = ds_cfg.ctl[dsf_bts];
-	if (!(tracer->trace.ds.flags & BTS_KERNEL))
-		control |= ds_cfg.ctl[dsf_bts_kernel];
-	if (!(tracer->trace.ds.flags & BTS_USER))
-		control |= ds_cfg.ctl[dsf_bts_user];
-
-	return control;
-}
-
-void ds_resume_bts(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long debugctlmsr;
-	int cpu;
-
-	if (!tracer)
-		return;
-
-	tracer->flags = tracer->trace.ds.flags;
-
-	task = tracer->ds.context->task;
-	cpu  = tracer->ds.context->cpu;
-
-	WARN_ON(!task && irqs_disabled());
-
-	debugctlmsr = (task ?
-		       task->thread.debugctlmsr :
-		       get_debugctlmsr_on_cpu(cpu));
-	debugctlmsr |= ds_bts_control(tracer);
-
-	if (task)
-		update_task_debugctlmsr(task, debugctlmsr);
-	else
-		update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-
-int ds_resume_bts_noirq(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long debugctlmsr, irq;
-	int cpu, error = 0;
-
-	if (!tracer)
-		return 0;
-
-	tracer->flags = tracer->trace.ds.flags;
-
-	task = tracer->ds.context->task;
-	cpu  = tracer->ds.context->cpu;
-
-	local_irq_save(irq);
-
-	error = -EPERM;
-	if (!task && (cpu != smp_processor_id()))
-		goto out;
-
-	debugctlmsr = (task ?
-		       task->thread.debugctlmsr :
-		       get_debugctlmsr());
-	debugctlmsr |= ds_bts_control(tracer);
-
-	if (task)
-		update_task_debugctlmsr(task, debugctlmsr);
-	else
-		update_debugctlmsr(debugctlmsr);
-
-	error = 0;
- out:
-	local_irq_restore(irq);
-	return error;
-}
-
-static void ds_free_pebs(struct pebs_tracer *tracer)
-{
-	struct task_struct *task;
-
-	task = tracer->ds.context->task;
-
-	WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
-	tracer->ds.context->pebs_master = NULL;
-
-	ds_put_context(tracer->ds.context);
-	put_tracer(task);
-
-	kfree(tracer);
-}
-
-void ds_release_pebs(struct pebs_tracer *tracer)
-{
-	might_sleep();
-
-	if (!tracer)
-		return;
-
-	ds_suspend_pebs(tracer);
-	ds_free_pebs(tracer);
-}
-
-int ds_release_pebs_noirq(struct pebs_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long irq;
-	int error;
-
-	if (!tracer)
-		return 0;
-
-	task = tracer->ds.context->task;
-
-	local_irq_save(irq);
-
-	error = -EPERM;
-	if (!task &&
-	    (tracer->ds.context->cpu != smp_processor_id()))
-		goto out;
-
-	error = -EPERM;
-	if (task && (task != current))
-		goto out;
-
-	ds_suspend_pebs_noirq(tracer);
-	ds_free_pebs(tracer);
-
-	error = 0;
- out:
-	local_irq_restore(irq);
-	return error;
-}
-
-void ds_suspend_pebs(struct pebs_tracer *tracer)
-{
-
-}
-
-int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
-{
-	return 0;
-}
-
-void ds_resume_pebs(struct pebs_tracer *tracer)
-{
-
-}
-
-int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
-{
-	return 0;
-}
-
-const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
-{
-	if (!tracer)
-		return NULL;
-
-	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
-	return &tracer->trace;
-}
-
-const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
-{
-	if (!tracer)
-		return NULL;
-
-	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-
-	tracer->trace.counters = ds_cfg.nr_counter_reset;
-	memcpy(tracer->trace.counter_reset,
-	       tracer->ds.context->ds +
-	       (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
-	       ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
-
-	return &tracer->trace;
-}
-
-int ds_reset_bts(struct bts_tracer *tracer)
-{
-	if (!tracer)
-		return -EINVAL;
-
-	tracer->trace.ds.top = tracer->trace.ds.begin;
-
-	ds_set(tracer->ds.context->ds, ds_bts, ds_index,
-	       (unsigned long)tracer->trace.ds.top);
-
-	return 0;
-}
-
-int ds_reset_pebs(struct pebs_tracer *tracer)
-{
-	if (!tracer)
-		return -EINVAL;
-
-	tracer->trace.ds.top = tracer->trace.ds.begin;
-
-	ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
-	       (unsigned long)tracer->trace.ds.top);
-
-	return 0;
-}
-
-int ds_set_pebs_reset(struct pebs_tracer *tracer,
-		      unsigned int counter, u64 value)
-{
-	if (!tracer)
-		return -EINVAL;
-
-	if (ds_cfg.nr_counter_reset < counter)
-		return -EINVAL;
-
-	*(u64 *)(tracer->ds.context->ds +
-		 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
-		 (counter * PEBS_RESET_FIELD_SIZE)) = value;
-
-	return 0;
-}
-
-static const struct ds_configuration ds_cfg_netburst = {
-	.name = "Netburst",
-	.ctl[dsf_bts]		= (1 << 2) | (1 << 3),
-	.ctl[dsf_bts_kernel]	= (1 << 5),
-	.ctl[dsf_bts_user]	= (1 << 6),
-	.nr_counter_reset	= 1,
-};
-static const struct ds_configuration ds_cfg_pentium_m = {
-	.name = "Pentium M",
-	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
-	.nr_counter_reset	= 1,
-};
-static const struct ds_configuration ds_cfg_core2_atom = {
-	.name = "Core 2/Atom",
-	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
-	.ctl[dsf_bts_kernel]	= (1 << 9),
-	.ctl[dsf_bts_user]	= (1 << 10),
-	.nr_counter_reset	= 1,
-};
-static const struct ds_configuration ds_cfg_core_i7 = {
-	.name = "Core i7",
-	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
-	.ctl[dsf_bts_kernel]	= (1 << 9),
-	.ctl[dsf_bts_user]	= (1 << 10),
-	.nr_counter_reset	= 4,
-};
-
-static void
-ds_configure(const struct ds_configuration *cfg,
-	     struct cpuinfo_x86 *cpu)
-{
-	unsigned long nr_pebs_fields = 0;
-
-	printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
-
-#ifdef __i386__
-	nr_pebs_fields = 10;
-#else
-	nr_pebs_fields = 18;
-#endif
-
-	/*
-	 * Starting with version 2, architectural performance
-	 * monitoring supports a format specifier.
-	 */
-	if ((cpuid_eax(0xa) & 0xff) > 1) {
-		unsigned long perf_capabilities, format;
-
-		rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
-
-		format = (perf_capabilities >> 8) & 0xf;
-
-		switch (format) {
-		case 0:
-			nr_pebs_fields = 18;
-			break;
-		case 1:
-			nr_pebs_fields = 22;
-			break;
-		default:
-			printk(KERN_INFO
-			       "[ds] unknown PEBS format: %lu\n", format);
-			nr_pebs_fields = 0;
-			break;
-		}
-	}
-
-	memset(&ds_cfg, 0, sizeof(ds_cfg));
-	ds_cfg = *cfg;
-
-	ds_cfg.sizeof_ptr_field =
-		(cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
-
-	ds_cfg.sizeof_rec[ds_bts]  = ds_cfg.sizeof_ptr_field * 3;
-	ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
-
-	if (!cpu_has(cpu, X86_FEATURE_BTS)) {
-		ds_cfg.sizeof_rec[ds_bts] = 0;
-		printk(KERN_INFO "[ds] bts not available\n");
-	}
-	if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
-		ds_cfg.sizeof_rec[ds_pebs] = 0;
-		printk(KERN_INFO "[ds] pebs not available\n");
-	}
-
-	printk(KERN_INFO "[ds] sizes: address: %u bit, ",
-	       8 * ds_cfg.sizeof_ptr_field);
-	printk("bts/pebs record: %u/%u bytes\n",
-	       ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
-
-	WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
-}
-
-void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
-{
-	/* Only configure the first cpu. Others are identical. */
-	if (ds_cfg.name)
-		return;
-
-	switch (c->x86) {
-	case 0x6:
-		switch (c->x86_model) {
-		case 0x9:
-		case 0xd: /* Pentium M */
-			ds_configure(&ds_cfg_pentium_m, c);
-			break;
-		case 0xf:
-		case 0x17: /* Core2 */
-		case 0x1c: /* Atom */
-			ds_configure(&ds_cfg_core2_atom, c);
-			break;
-		case 0x1a: /* Core i7 */
-			ds_configure(&ds_cfg_core_i7, c);
-			break;
-		default:
-			/* Sorry, don't know about them. */
-			break;
-		}
-		break;
-	case 0xf:
-		switch (c->x86_model) {
-		case 0x0:
-		case 0x1:
-		case 0x2: /* Netburst */
-			ds_configure(&ds_cfg_netburst, c);
-			break;
-		default:
-			/* Sorry, don't know about them. */
-			break;
-		}
-		break;
-	default:
-		/* Sorry, don't know about them. */
-		break;
-	}
-}
-
-static inline void ds_take_timestamp(struct ds_context *context,
-				     enum bts_qualifier qualifier,
-				     struct task_struct *task)
-{
-	struct bts_tracer *tracer = context->bts_master;
-	struct bts_struct ts;
-
-	/* Prevent compilers from reading the tracer pointer twice. */
-	barrier();
-
-	if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
-		return;
-
-	memset(&ts, 0, sizeof(ts));
-	ts.qualifier		= qualifier;
-	ts.variant.event.clock	= trace_clock_global();
-	ts.variant.event.pid	= task->pid;
-
-	bts_write(tracer, &ts);
-}
-
-/*
- * Change the DS configuration from tracing prev to tracing next.
- */
-void ds_switch_to(struct task_struct *prev, struct task_struct *next)
-{
-	struct ds_context *prev_ctx	= prev->thread.ds_ctx;
-	struct ds_context *next_ctx	= next->thread.ds_ctx;
-	unsigned long debugctlmsr	= next->thread.debugctlmsr;
-
-	/* Make sure all data is read before we start. */
-	barrier();
-
-	if (prev_ctx) {
-		update_debugctlmsr(0);
-
-		ds_take_timestamp(prev_ctx, bts_task_departs, prev);
-	}
-
-	if (next_ctx) {
-		ds_take_timestamp(next_ctx, bts_task_arrives, next);
-
-		wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
-	}
-
-	update_debugctlmsr(debugctlmsr);
-}
-
-static __init int ds_selftest(void)
-{
-	if (ds_cfg.sizeof_rec[ds_bts]) {
-		int error;
-
-		error = ds_selftest_bts();
-		if (error) {
-			WARN(1, "[ds] selftest failed. disabling bts.\n");
-			ds_cfg.sizeof_rec[ds_bts] = 0;
-		}
-	}
-
-	if (ds_cfg.sizeof_rec[ds_pebs]) {
-		int error;
-
-		error = ds_selftest_pebs();
-		if (error) {
-			WARN(1, "[ds] selftest failed. disabling pebs.\n");
-			ds_cfg.sizeof_rec[ds_pebs] = 0;
-		}
-	}
-
-	return 0;
-}
-device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
deleted file mode 100644
index 6bc7c199ab99..000000000000
--- a/arch/x86/kernel/ds_selftest.c
+++ /dev/null
@@ -1,408 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-
-#include "ds_selftest.h"
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <asm/ds.h>
-
-
-#define BUFFER_SIZE		521	/* Intentionally chose an odd size. */
-#define SMALL_BUFFER_SIZE	24	/* A single bts entry. */
-
-struct ds_selftest_bts_conf {
-	struct bts_tracer *tracer;
-	int error;
-	int (*suspend)(struct bts_tracer *);
-	int (*resume)(struct bts_tracer *);
-};
-
-static int ds_selftest_bts_consistency(const struct bts_trace *trace)
-{
-	int error = 0;
-
-	if (!trace) {
-		printk(KERN_CONT "failed to access trace...");
-		/* Bail out. Other tests are pointless. */
-		return -1;
-	}
-
-	if (!trace->read) {
-		printk(KERN_CONT "bts read not available...");
-		error = -1;
-	}
-
-	/* Do some sanity checks on the trace configuration. */
-	if (!trace->ds.n) {
-		printk(KERN_CONT "empty bts buffer...");
-		error = -1;
-	}
-	if (!trace->ds.size) {
-		printk(KERN_CONT "bad bts trace setup...");
-		error = -1;
-	}
-	if (trace->ds.end !=
-	    (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
-		printk(KERN_CONT "bad bts buffer setup...");
-		error = -1;
-	}
-	/*
-	 * We allow top in [begin; end], since its not clear when the
-	 * overflow adjustment happens: after the increment or before the
-	 * write.
-	 */
-	if ((trace->ds.top < trace->ds.begin) ||
-	    (trace->ds.end < trace->ds.top)) {
-		printk(KERN_CONT "bts top out of bounds...");
-		error = -1;
-	}
-
-	return error;
-}
-
-static int ds_selftest_bts_read(struct bts_tracer *tracer,
-				const struct bts_trace *trace,
-				const void *from, const void *to)
-{
-	const unsigned char *at;
-
-	/*
-	 * Check a few things which do not belong to this test.
-	 * They should be covered by other tests.
-	 */
-	if (!trace)
-		return -1;
-
-	if (!trace->read)
-		return -1;
-
-	if (to < from)
-		return -1;
-
-	if (from < trace->ds.begin)
-		return -1;
-
-	if (trace->ds.end < to)
-		return -1;
-
-	if (!trace->ds.size)
-		return -1;
-
-	/* Now to the test itself. */
-	for (at = from; (void *)at < to; at += trace->ds.size) {
-		struct bts_struct bts;
-		unsigned long index;
-		int error;
-
-		if (((void *)at - trace->ds.begin) % trace->ds.size) {
-			printk(KERN_CONT
-			       "read from non-integer index...");
-			return -1;
-		}
-		index = ((void *)at - trace->ds.begin) / trace->ds.size;
-
-		memset(&bts, 0, sizeof(bts));
-		error = trace->read(tracer, at, &bts);
-		if (error < 0) {
-			printk(KERN_CONT
-			       "error reading bts trace at [%lu] (0x%p)...",
-			       index, at);
-			return error;
-		}
-
-		switch (bts.qualifier) {
-		case BTS_BRANCH:
-			break;
-		default:
-			printk(KERN_CONT
-			       "unexpected bts entry %llu at [%lu] (0x%p)...",
-			       bts.qualifier, index, at);
-			return -1;
-		}
-	}
-
-	return 0;
-}
-
-static void ds_selftest_bts_cpu(void *arg)
-{
-	struct ds_selftest_bts_conf *conf = arg;
-	const struct bts_trace *trace;
-	void *top;
-
-	if (IS_ERR(conf->tracer)) {
-		conf->error = PTR_ERR(conf->tracer);
-		conf->tracer = NULL;
-
-		printk(KERN_CONT
-		       "initialization failed (err: %d)...", conf->error);
-		return;
-	}
-
-	/* We should meanwhile have enough trace. */
-	conf->error = conf->suspend(conf->tracer);
-	if (conf->error < 0)
-		return;
-
-	/* Let's see if we can access the trace. */
-	trace = ds_read_bts(conf->tracer);
-
-	conf->error = ds_selftest_bts_consistency(trace);
-	if (conf->error < 0)
-		return;
-
-	/* If everything went well, we should have a few trace entries. */
-	if (trace->ds.top == trace->ds.begin) {
-		/*
-		 * It is possible but highly unlikely that we got a
-		 * buffer overflow and end up at exactly the same
-		 * position we started from.
-		 * Let's issue a warning, but continue.
-		 */
-		printk(KERN_CONT "no trace/overflow...");
-	}
-
-	/* Let's try to read the trace we collected. */
-	conf->error =
-		ds_selftest_bts_read(conf->tracer, trace,
-				     trace->ds.begin, trace->ds.top);
-	if (conf->error < 0)
-		return;
-
-	/*
-	 * Let's read the trace again.
-	 * Since we suspended tracing, we should get the same result.
-	 */
-	top = trace->ds.top;
-
-	trace = ds_read_bts(conf->tracer);
-	conf->error = ds_selftest_bts_consistency(trace);
-	if (conf->error < 0)
-		return;
-
-	if (top != trace->ds.top) {
-		printk(KERN_CONT "suspend not working...");
-		conf->error = -1;
-		return;
-	}
-
-	/* Let's collect some more trace - see if resume is working. */
-	conf->error = conf->resume(conf->tracer);
-	if (conf->error < 0)
-		return;
-
-	conf->error = conf->suspend(conf->tracer);
-	if (conf->error < 0)
-		return;
-
-	trace = ds_read_bts(conf->tracer);
-
-	conf->error = ds_selftest_bts_consistency(trace);
-	if (conf->error < 0)
-		return;
-
-	if (trace->ds.top == top) {
-		/*
-		 * It is possible but highly unlikely that we got a
-		 * buffer overflow and end up at exactly the same
-		 * position we started from.
-		 * Let's issue a warning and check the full trace.
-		 */
-		printk(KERN_CONT
-		       "no resume progress/overflow...");
-
-		conf->error =
-			ds_selftest_bts_read(conf->tracer, trace,
-					     trace->ds.begin, trace->ds.end);
-	} else if (trace->ds.top < top) {
-		/*
-		 * We had a buffer overflow - the entire buffer should
-		 * contain trace records.
-		 */
-		conf->error =
-			ds_selftest_bts_read(conf->tracer, trace,
-					     trace->ds.begin, trace->ds.end);
-	} else {
-		/*
-		 * It is quite likely that the buffer did not overflow.
-		 * Let's just check the delta trace.
-		 */
-		conf->error =
-			ds_selftest_bts_read(conf->tracer, trace, top,
-					     trace->ds.top);
-	}
-	if (conf->error < 0)
-		return;
-
-	conf->error = 0;
-}
-
-static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
-{
-	ds_suspend_bts(tracer);
-	return 0;
-}
-
-static int ds_resume_bts_wrap(struct bts_tracer *tracer)
-{
-	ds_resume_bts(tracer);
-	return 0;
-}
-
-static void ds_release_bts_noirq_wrap(void *tracer)
-{
-	(void)ds_release_bts_noirq(tracer);
-}
-
-static int ds_selftest_bts_bad_release_noirq(int cpu,
-					     struct bts_tracer *tracer)
-{
-	int error = -EPERM;
-
-	/* Try to release the tracer on the wrong cpu. */
-	get_cpu();
-	if (cpu != smp_processor_id()) {
-		error = ds_release_bts_noirq(tracer);
-		if (error != -EPERM)
-			printk(KERN_CONT "release on wrong cpu...");
-	}
-	put_cpu();
-
-	return error ? 0 : -1;
-}
-
-static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
-{
-	struct bts_tracer *tracer;
-	int error;
-
-	/* Try to request cpu tracing while task tracing is active. */
-	tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
-				    (size_t)-1, BTS_KERNEL);
-	error = PTR_ERR(tracer);
-	if (!IS_ERR(tracer)) {
-		ds_release_bts(tracer);
-		error = 0;
-	}
-
-	if (error != -EPERM)
-		printk(KERN_CONT "cpu/task tracing overlap...");
-
-	return error ? 0 : -1;
-}
-
-static int ds_selftest_bts_bad_request_task(void *buffer)
-{
-	struct bts_tracer *tracer;
-	int error;
-
-	/* Try to request cpu tracing while task tracing is active. */
-	tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
-				    (size_t)-1, BTS_KERNEL);
-	error = PTR_ERR(tracer);
-	if (!IS_ERR(tracer)) {
-		error = 0;
-		ds_release_bts(tracer);
-	}
-
-	if (error != -EPERM)
-		printk(KERN_CONT "task/cpu tracing overlap...");
-
-	return error ? 0 : -1;
-}
-
-int ds_selftest_bts(void)
-{
-	struct ds_selftest_bts_conf conf;
-	unsigned char buffer[BUFFER_SIZE], *small_buffer;
-	unsigned long irq;
-	int cpu;
-
-	printk(KERN_INFO "[ds] bts selftest...");
-	conf.error = 0;
-
-	small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		conf.suspend = ds_suspend_bts_wrap;
-		conf.resume = ds_resume_bts_wrap;
-		conf.tracer =
-			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
-					   NULL, (size_t)-1, BTS_KERNEL);
-		ds_selftest_bts_cpu(&conf);
-		if (conf.error >= 0)
-			conf.error = ds_selftest_bts_bad_request_task(buffer);
-		ds_release_bts(conf.tracer);
-		if (conf.error < 0)
-			goto out;
-
-		conf.suspend = ds_suspend_bts_noirq;
-		conf.resume = ds_resume_bts_noirq;
-		conf.tracer =
-			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
-					   NULL, (size_t)-1, BTS_KERNEL);
-		smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
-		if (conf.error >= 0) {
-			conf.error =
-				ds_selftest_bts_bad_release_noirq(cpu,
-								  conf.tracer);
-			/* We must not release the tracer twice. */
-			if (conf.error < 0)
-				conf.tracer = NULL;
-		}
-		if (conf.error >= 0)
-			conf.error = ds_selftest_bts_bad_request_task(buffer);
-		smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
-					 conf.tracer, 1);
-		if (conf.error < 0)
-			goto out;
-	}
-
-	conf.suspend = ds_suspend_bts_wrap;
-	conf.resume = ds_resume_bts_wrap;
-	conf.tracer =
-		ds_request_bts_task(current, buffer, BUFFER_SIZE,
-				    NULL, (size_t)-1, BTS_KERNEL);
-	ds_selftest_bts_cpu(&conf);
-	if (conf.error >= 0)
-		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
-	ds_release_bts(conf.tracer);
-	if (conf.error < 0)
-		goto out;
-
-	conf.suspend = ds_suspend_bts_noirq;
-	conf.resume = ds_resume_bts_noirq;
-	conf.tracer =
-		ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
-				   NULL, (size_t)-1, BTS_KERNEL);
-	local_irq_save(irq);
-	ds_selftest_bts_cpu(&conf);
-	if (conf.error >= 0)
-		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
-	ds_release_bts_noirq(conf.tracer);
-	local_irq_restore(irq);
-	if (conf.error < 0)
-		goto out;
-
-	conf.error = 0;
- out:
-	put_online_cpus();
-	printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
-
-	return conf.error;
-}
-
-int ds_selftest_pebs(void)
-{
-	return 0;
-}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
deleted file mode 100644
index 2ba8745c6663..000000000000
--- a/arch/x86/kernel/ds_selftest.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-
-#ifdef CONFIG_X86_DS_SELFTEST
-extern int ds_selftest_bts(void);
-extern int ds_selftest_pebs(void);
-#else
-static inline int ds_selftest_bts(void) { return 0; }
-static inline int ds_selftest_pebs(void) { return 0; }
-#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6d817554780a..c89a386930b7 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -224,11 +224,6 @@ unsigned __kprobes long oops_begin(void)
 	int cpu;
 	unsigned long flags;
 
-	/* notify the hw-branch tracer so it may disable tracing and
-	   add the last trace to the trace buffer -
-	   the earlier this happens, the more useful the trace. */
-	trace_hw_branch_oops();
-
 	oops_enter();
 
 	/* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b43bbaebe2c0..7a880ad3a208 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -422,14 +422,12 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 
 static void __kprobes clear_btf(void)
 {
-	if (test_thread_flag(TIF_DEBUGCTLMSR))
-		update_debugctlmsr(0);
+	/* XXX */
 }
 
 static void __kprobes restore_btf(void)
 {
-	if (test_thread_flag(TIF_DEBUGCTLMSR))
-		update_debugctlmsr(current->thread.debugctlmsr);
+	/* XXX */
 }
 
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ad9540676fcc..1a60beb32ede 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -20,7 +20,6 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
-#include <asm/ds.h>
 #include <asm/debugreg.h>
 
 unsigned long idle_halt;
@@ -50,8 +49,6 @@ void free_thread_xstate(struct task_struct *tsk)
 		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
 		tsk->thread.xstate = NULL;
 	}
-
-	WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
 
 void free_thread_info(struct thread_info *ti)
@@ -198,12 +195,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 
-	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
-	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
-		ds_switch_to(prev_p, next_p);
-	else if (next->debugctlmsr != prev->debugctlmsr)
-		update_debugctlmsr(next->debugctlmsr);
-
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c62667e30c..75090c589b7a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,7 +55,6 @@
 #include <asm/cpu.h>
 #include <asm/idle.h>
 #include <asm/syscalls.h>
-#include <asm/ds.h>
 #include <asm/debugreg.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -238,13 +237,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
-
-	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
-	p->thread.ds_ctx = NULL;
-
-	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
-	p->thread.debugctlmsr = 0;
-
 	return err;
 }
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index dc9690b4c4cc..cc4258f2beb5 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,7 +49,6 @@
 #include <asm/ia32.h>
 #include <asm/idle.h>
 #include <asm/syscalls.h>
-#include <asm/ds.h>
 #include <asm/debugreg.h>
 
 asmlinkage extern void ret_from_fork(void);
@@ -313,13 +312,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		if (err)
 			goto out;
 	}
-
-	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
-	p->thread.ds_ctx = NULL;
-
-	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
-	p->thread.debugctlmsr = 0;
-
 	err = 0;
 out:
 	if (err && p->thread.io_bitmap_ptr) {
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a503b1fd04e5..f2fd3b80e565 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -2,9 +2,6 @@
 /*
  * Pentium III FXSR, SSE support
  *	Gareth Hughes <gareth@valinux.com>, May 2000
- *
- * BTS tracing
- *	Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
  */
 
 #include <linux/kernel.h>
@@ -21,7 +18,6 @@
 #include <linux/audit.h>
 #include <linux/seccomp.h>
 #include <linux/signal.h>
-#include <linux/workqueue.h>
 #include <linux/perf_event.h>
 #include <linux/hw_breakpoint.h>
 
@@ -35,7 +31,6 @@
 #include <asm/desc.h>
 #include <asm/prctl.h>
 #include <asm/proto.h>
-#include <asm/ds.h>
 #include <asm/hw_breakpoint.h>
 
 #include "tls.h"
@@ -788,342 +783,6 @@ static int ioperm_get(struct task_struct *target,
 				   0, IO_BITMAP_BYTES);
 }
 
-#ifdef CONFIG_X86_PTRACE_BTS
-/*
- * A branch trace store context.
- *
- * Contexts may only be installed by ptrace_bts_config() and only for
- * ptraced tasks.
- *
- * Contexts are destroyed when the tracee is detached from the tracer.
- * The actual destruction work requires interrupts enabled, so the
- * work is deferred and will be scheduled during __ptrace_unlink().
- *
- * Contexts hold an additional task_struct reference on the traced
- * task, as well as a reference on the tracer's mm.
- *
- * Ptrace already holds a task_struct for the duration of ptrace operations,
- * but since destruction is deferred, it may be executed after both
- * tracer and tracee exited.
- */
-struct bts_context {
-	/* The branch trace handle. */
-	struct bts_tracer	*tracer;
-
-	/* The buffer used to store the branch trace and its size. */
-	void			*buffer;
-	unsigned int		size;
-
-	/* The mm that paid for the above buffer. */
-	struct mm_struct	*mm;
-
-	/* The task this context belongs to. */
-	struct task_struct	*task;
-
-	/* The signal to send on a bts buffer overflow. */
-	unsigned int		bts_ovfl_signal;
-
-	/* The work struct to destroy a context. */
-	struct work_struct	work;
-};
-
-static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
-{
-	void *buffer = NULL;
-	int err = -ENOMEM;
-
-	err = account_locked_memory(current->mm, current->signal->rlim, size);
-	if (err < 0)
-		return err;
-
-	buffer = kzalloc(size, GFP_KERNEL);
-	if (!buffer)
-		goto out_refund;
-
-	context->buffer = buffer;
-	context->size = size;
-	context->mm = get_task_mm(current);
-
-	return 0;
-
- out_refund:
-	refund_locked_memory(current->mm, size);
-	return err;
-}
-
-static inline void free_bts_buffer(struct bts_context *context)
-{
-	if (!context->buffer)
-		return;
-
-	kfree(context->buffer);
-	context->buffer = NULL;
-
-	refund_locked_memory(context->mm, context->size);
-	context->size = 0;
-
-	mmput(context->mm);
-	context->mm = NULL;
-}
-
-static void free_bts_context_work(struct work_struct *w)
-{
-	struct bts_context *context;
-
-	context = container_of(w, struct bts_context, work);
-
-	ds_release_bts(context->tracer);
-	put_task_struct(context->task);
-	free_bts_buffer(context);
-	kfree(context);
-}
-
-static inline void free_bts_context(struct bts_context *context)
-{
-	INIT_WORK(&context->work, free_bts_context_work);
-	schedule_work(&context->work);
-}
-
-static inline struct bts_context *alloc_bts_context(struct task_struct *task)
-{
-	struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (context) {
-		context->task = task;
-		task->bts = context;
-
-		get_task_struct(task);
-	}
-
-	return context;
-}
-
-static int ptrace_bts_read_record(struct task_struct *child, size_t index,
-				  struct bts_struct __user *out)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-	struct bts_struct bts;
-	const unsigned char *at;
-	int error;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	at = trace->ds.top - ((index + 1) * trace->ds.size);
-	if ((void *)at < trace->ds.begin)
-		at += (trace->ds.n * trace->ds.size);
-
-	if (!trace->read)
-		return -EOPNOTSUPP;
-
-	error = trace->read(context->tracer, at, &bts);
-	if (error < 0)
-		return error;
-
-	if (copy_to_user(out, &bts, sizeof(bts)))
-		return -EFAULT;
-
-	return sizeof(bts);
-}
-
-static int ptrace_bts_drain(struct task_struct *child,
-			    long size,
-			    struct bts_struct __user *out)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-	const unsigned char *at;
-	int error, drained = 0;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	if (!trace->read)
-		return -EOPNOTSUPP;
-
-	if (size < (trace->ds.top - trace->ds.begin))
-		return -EIO;
-
-	for (at = trace->ds.begin; (void *)at < trace->ds.top;
-	     out++, drained++, at += trace->ds.size) {
-		struct bts_struct bts;
-
-		error = trace->read(context->tracer, at, &bts);
-		if (error < 0)
-			return error;
-
-		if (copy_to_user(out, &bts, sizeof(bts)))
-			return -EFAULT;
-	}
-
-	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-
-	error = ds_reset_bts(context->tracer);
-	if (error < 0)
-		return error;
-
-	return drained;
-}
-
-static int ptrace_bts_config(struct task_struct *child,
-			     long cfg_size,
-			     const struct ptrace_bts_config __user *ucfg)
-{
-	struct bts_context *context;
-	struct ptrace_bts_config cfg;
-	unsigned int flags = 0;
-
-	if (cfg_size < sizeof(cfg))
-		return -EIO;
-
-	if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
-		return -EFAULT;
-
-	context = child->bts;
-	if (!context)
-		context = alloc_bts_context(child);
-	if (!context)
-		return -ENOMEM;
-
-	if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
-		if (!cfg.signal)
-			return -EINVAL;
-
-		return -EOPNOTSUPP;
-		context->bts_ovfl_signal = cfg.signal;
-	}
-
-	ds_release_bts(context->tracer);
-	context->tracer = NULL;
-
-	if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
-		int err;
-
-		free_bts_buffer(context);
-		if (!cfg.size)
-			return 0;
-
-		err = alloc_bts_buffer(context, cfg.size);
-		if (err < 0)
-			return err;
-	}
-
-	if (cfg.flags & PTRACE_BTS_O_TRACE)
-		flags |= BTS_USER;
-
-	if (cfg.flags & PTRACE_BTS_O_SCHED)
-		flags |= BTS_TIMESTAMPS;
-
-	context->tracer =
-		ds_request_bts_task(child, context->buffer, context->size,
-				    NULL, (size_t)-1, flags);
-	if (unlikely(IS_ERR(context->tracer))) {
-		int error = PTR_ERR(context->tracer);
-
-		free_bts_buffer(context);
-		context->tracer = NULL;
-		return error;
-	}
-
-	return sizeof(cfg);
-}
-
-static int ptrace_bts_status(struct task_struct *child,
-			     long cfg_size,
-			     struct ptrace_bts_config __user *ucfg)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-	struct ptrace_bts_config cfg;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	if (cfg_size < sizeof(cfg))
-		return -EIO;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	memset(&cfg, 0, sizeof(cfg));
-	cfg.size	= trace->ds.end - trace->ds.begin;
-	cfg.signal	= context->bts_ovfl_signal;
-	cfg.bts_size	= sizeof(struct bts_struct);
-
-	if (cfg.signal)
-		cfg.flags |= PTRACE_BTS_O_SIGNAL;
-
-	if (trace->ds.flags & BTS_USER)
-		cfg.flags |= PTRACE_BTS_O_TRACE;
-
-	if (trace->ds.flags & BTS_TIMESTAMPS)
-		cfg.flags |= PTRACE_BTS_O_SCHED;
-
-	if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
-		return -EFAULT;
-
-	return sizeof(cfg);
-}
-
-static int ptrace_bts_clear(struct task_struct *child)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-
-	return ds_reset_bts(context->tracer);
-}
-
-static int ptrace_bts_size(struct task_struct *child)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	return (trace->ds.top - trace->ds.begin) / trace->ds.size;
-}
-
-/*
- * Called from __ptrace_unlink() after the child has been moved back
- * to its original parent.
- */
-void ptrace_bts_untrace(struct task_struct *child)
-{
-	if (unlikely(child->bts)) {
-		free_bts_context(child->bts);
-		child->bts = NULL;
-	}
-}
-#endif /* CONFIG_X86_PTRACE_BTS */
-
 /*
  * Called by kernel/ptrace.c when detaching..
  *
@@ -1251,39 +910,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 #endif
 
-	/*
-	 * These bits need more cooking - not enabled yet:
-	 */
-#ifdef CONFIG_X86_PTRACE_BTS
-	case PTRACE_BTS_CONFIG:
-		ret = ptrace_bts_config
-			(child, data, (struct ptrace_bts_config __user *)addr);
-		break;
-
-	case PTRACE_BTS_STATUS:
-		ret = ptrace_bts_status
-			(child, data, (struct ptrace_bts_config __user *)addr);
-		break;
-
-	case PTRACE_BTS_SIZE:
-		ret = ptrace_bts_size(child);
-		break;
-
-	case PTRACE_BTS_GET:
-		ret = ptrace_bts_read_record
-			(child, data, (struct bts_struct __user *) addr);
-		break;
-
-	case PTRACE_BTS_CLEAR:
-		ret = ptrace_bts_clear(child);
-		break;
-
-	case PTRACE_BTS_DRAIN:
-		ret = ptrace_bts_drain
-			(child, data, (struct bts_struct __user *) addr);
-		break;
-#endif /* CONFIG_X86_PTRACE_BTS */
-
 	default:
 		ret = ptrace_request(child, request, addr, data);
 		break;
@@ -1543,14 +1169,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 
 	case PTRACE_GET_THREAD_AREA:
 	case PTRACE_SET_THREAD_AREA:
-#ifdef CONFIG_X86_PTRACE_BTS
-	case PTRACE_BTS_CONFIG:
-	case PTRACE_BTS_STATUS:
-	case PTRACE_BTS_SIZE:
-	case PTRACE_BTS_GET:
-	case PTRACE_BTS_CLEAR:
-	case PTRACE_BTS_DRAIN:
-#endif /* CONFIG_X86_PTRACE_BTS */
 		return arch_ptrace(child, request, addr, data);
 
 	default:
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 3149032ff107..7beba0769a8c 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -157,22 +157,6 @@ static int enable_single_step(struct task_struct *child)
 	return 1;
 }
 
-/*
- * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
- */
-static void write_debugctlmsr(struct task_struct *child, unsigned long val)
-{
-	if (child->thread.debugctlmsr == val)
-		return;
-
-	child->thread.debugctlmsr = val;
-
-	if (child != current)
-		return;
-
-	update_debugctlmsr(val);
-}
-
 /*
  * Enable single or block step.
  */
@@ -185,17 +169,9 @@ static void enable_step(struct task_struct *child, bool block)
 	 * So noone should try to use debugger block stepping in a program
 	 * that uses user-mode single stepping itself.
 	 */
-	if (enable_single_step(child) && block) {
-		set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-		write_debugctlmsr(child,
-				  child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
-	} else {
-		write_debugctlmsr(child,
-				  child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
-
-		if (!child->thread.debugctlmsr)
-			clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-	}
+	if (!enable_single_step(child))
+		return;
+	/* XXX */
 }
 
 void user_enable_single_step(struct task_struct *child)
@@ -213,11 +189,7 @@ void user_disable_single_step(struct task_struct *child)
 	/*
 	 * Make sure block stepping (BTF) is disabled.
 	 */
-	write_debugctlmsr(child,
-			  child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
-
-	if (!child->thread.debugctlmsr)
-		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+	/* XXX */
 
 	/* Always clear TIF_SINGLESTEP... */
 	clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e4454188..e3da5d726a37 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -543,11 +543,6 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 
 	/* DR6 may or may not be cleared by the CPU */
 	set_debugreg(0, 6);
-	/*
-	 * The processor cleared BTF, so don't mark that we need it set.
-	 */
-	clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
-	tsk->thread.debugctlmsr = 0;
 
 	/* Store the virtualized DR6 value */
 	tsk->thread.debugreg6 = dr6;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 01e6adea07ec..cc12b3c556b3 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -504,18 +504,6 @@ extern int ftrace_dump_on_oops;
 #define INIT_TRACE_RECURSION
 #endif
 
-#ifdef CONFIG_HW_BRANCH_TRACER
-
-void trace_hw_branch(u64 from, u64 to);
-void trace_hw_branch_oops(void);
-
-#else /* CONFIG_HW_BRANCH_TRACER */
-
-static inline void trace_hw_branch(u64 from, u64 to) {}
-static inline void trace_hw_branch_oops(void) {}
-
-#endif /* CONFIG_HW_BRANCH_TRACER */
-
 #ifdef CONFIG_FTRACE_SYSCALLS
 
 unsigned long arch_syscall_addr(int nr);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e70f21beb4b4..c8442b655111 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -19,7 +19,6 @@ struct anon_vma;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
-struct rlimit;
 
 #ifndef CONFIG_DISCONTIGMEM          /* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
@@ -1449,9 +1448,6 @@ int vmemmap_populate_basepages(struct page *start_page,
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
 
-extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
-				 size_t size);
-extern void refund_locked_memory(struct mm_struct *mm, size_t size);
 
 enum mf_flags {
 	MF_COUNT_INCREASED = 1 << 0,
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index e1fb60729979..4272521e29e9 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -345,18 +345,6 @@ static inline void user_single_step_siginfo(struct task_struct *tsk,
 #define arch_ptrace_stop(code, info)		do { } while (0)
 #endif
 
-#ifndef arch_ptrace_untrace
-/*
- * Do machine-specific work before untracing child.
- *
- * This is called for a normal detach as well as from ptrace_exit()
- * when the tracing task dies.
- *
- * Called with write_lock(&tasklist_lock) held.
- */
-#define arch_ptrace_untrace(task)		do { } while (0)
-#endif
-
 extern int task_current_syscall(struct task_struct *target, long *callno,
 				unsigned long args[6], unsigned int maxargs,
 				unsigned long *sp, unsigned long *pc);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dad7f668ebf7..e0447c64af6a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -99,7 +99,6 @@ struct futex_pi_state;
 struct robust_list_head;
 struct bio_list;
 struct fs_struct;
-struct bts_context;
 struct perf_event_context;
 
 /*
@@ -1272,12 +1271,6 @@ struct task_struct {
 	struct list_head ptraced;
 	struct list_head ptrace_entry;
 
-	/*
-	 * This is the tracer handle for the ptrace BTS extension.
-	 * This field actually belongs to the ptracer task.
-	 */
-	struct bts_context *bts;
-
 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
 	struct list_head thread_group;
@@ -2123,10 +2116,8 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
 extern char *get_task_comm(char *to, struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
-extern void wait_task_context_switch(struct task_struct *p);
 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
-static inline void wait_task_context_switch(struct task_struct *p) {}
 static inline unsigned long wait_task_inactive(struct task_struct *p,
 					       long match_state)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 4799c5f0e6d0..d67f1dbfbe03 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1108,9 +1108,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->memcg_batch.do_batch = 0;
 	p->memcg_batch.memcg = NULL;
 #endif
-
-	p->bts = NULL;
-
 	p->stack_start = stack_start;
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42ad8ae729a0..9fb51237b18c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -76,7 +76,6 @@ void __ptrace_unlink(struct task_struct *child)
 	child->parent = child->real_parent;
 	list_del_init(&child->ptrace_entry);
 
-	arch_ptrace_untrace(child);
 	if (task_is_traced(child))
 		ptrace_untrace(child);
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 9ab3cd7858d3..117b7cad31b3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2076,49 +2076,6 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 	return 1;
 }
 
-/*
- * wait_task_context_switch -	wait for a thread to complete at least one
- *				context switch.
- *
- * @p must not be current.
- */
-void wait_task_context_switch(struct task_struct *p)
-{
-	unsigned long nvcsw, nivcsw, flags;
-	int running;
-	struct rq *rq;
-
-	nvcsw	= p->nvcsw;
-	nivcsw	= p->nivcsw;
-	for (;;) {
-		/*
-		 * The runqueue is assigned before the actual context
-		 * switch. We need to take the runqueue lock.
-		 *
-		 * We could check initially without the lock but it is
-		 * very likely that we need to take the lock in every
-		 * iteration.
-		 */
-		rq = task_rq_lock(p, &flags);
-		running = task_running(rq, p);
-		task_rq_unlock(rq, &flags);
-
-		if (likely(!running))
-			break;
-		/*
-		 * The switch count is incremented before the actual
-		 * context switch. We thus wait for two switches to be
-		 * sure at least one completed.
-		 */
-		if ((p->nvcsw - nvcsw) > 1)
-			break;
-		if ((p->nivcsw - nivcsw) > 1)
-			break;
-
-		cpu_relax();
-	}
-}
-
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 13e13d428cd3..8b1797c4545b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD
 	help
 	  See Documentation/trace/ftrace-design.txt
 
-config HAVE_HW_BRANCH_TRACER
-	bool
-
 config HAVE_SYSCALL_TRACEPOINTS
 	bool
 	help
@@ -374,14 +371,6 @@ config STACK_TRACER
 
 	  Say N if unsure.
 
-config HW_BRANCH_TRACER
-	depends on HAVE_HW_BRANCH_TRACER
-	bool "Trace hw branches"
-	select GENERIC_TRACER
-	help
-	  This tracer records all branches on the system in a circular
-	  buffer, giving access to the last N branches for each cpu.
-
 config KMEMTRACE
 	bool "Trace SLAB allocations"
 	select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 78edc6490038..ffb1a5b0550e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
-obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2825ef2c0b15..bec2c973ff0c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,7 +34,6 @@ enum trace_type {
 	TRACE_GRAPH_RET,
 	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
-	TRACE_HW_BRANCHES,
 	TRACE_KMEM_ALLOC,
 	TRACE_KMEM_FREE,
 	TRACE_BLK,
@@ -229,7 +228,6 @@ extern void __ftrace_bad_type(void);
 			  TRACE_GRAPH_ENT);		\
 		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
 			  TRACE_GRAPH_RET);		\
-		IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
 		IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,	\
 			  TRACE_KMEM_ALLOC);	\
 		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
@@ -467,8 +465,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
 					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
-extern int trace_selftest_startup_hw_branches(struct tracer *trace,
-					      struct trace_array *tr);
 extern int trace_selftest_startup_ksym(struct tracer *trace,
 					 struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f399df..dc008c1240da 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch,
 		 __entry->func, __entry->file, __entry->correct)
 );
 
-FTRACE_ENTRY(hw_branch, hw_branch_entry,
-
-	TRACE_HW_BRANCHES,
-
-	F_STRUCT(
-		__field(	u64,	from	)
-		__field(	u64,	to	)
-	),
-
-	F_printk("from: %llx to: %llx", __entry->from, __entry->to)
-);
-
 FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
 
 	TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
deleted file mode 100644
index 7b97000745f5..000000000000
--- a/kernel/trace/trace_hw_branches.c
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * h/w branch tracer for x86 based on BTS
- *
- * Copyright (C) 2008-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
- */
-#include <linux/kallsyms.h>
-#include <linux/debugfs.h>
-#include <linux/ftrace.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/fs.h>
-
-#include <asm/ds.h>
-
-#include "trace_output.h"
-#include "trace.h"
-
-
-#define BTS_BUFFER_SIZE (1 << 13)
-
-static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
-static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
-
-#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
-
-static int trace_hw_branches_enabled __read_mostly;
-static int trace_hw_branches_suspended __read_mostly;
-static struct trace_array *hw_branch_trace __read_mostly;
-
-
-static void bts_trace_init_cpu(int cpu)
-{
-	per_cpu(hwb_tracer, cpu) =
-		ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
-				   BTS_BUFFER_SIZE, NULL, (size_t)-1,
-				   BTS_KERNEL);
-
-	if (IS_ERR(per_cpu(hwb_tracer, cpu)))
-		per_cpu(hwb_tracer, cpu) = NULL;
-}
-
-static int bts_trace_init(struct trace_array *tr)
-{
-	int cpu;
-
-	hw_branch_trace = tr;
-	trace_hw_branches_enabled = 0;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		bts_trace_init_cpu(cpu);
-
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			trace_hw_branches_enabled = 1;
-	}
-	trace_hw_branches_suspended = 0;
-	put_online_cpus();
-
-	/* If we could not enable tracing on a single cpu, we fail. */
-	return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
-}
-
-static void bts_trace_reset(struct trace_array *tr)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		if (likely(per_cpu(hwb_tracer, cpu))) {
-			ds_release_bts(per_cpu(hwb_tracer, cpu));
-			per_cpu(hwb_tracer, cpu) = NULL;
-		}
-	}
-	trace_hw_branches_enabled = 0;
-	trace_hw_branches_suspended = 0;
-	put_online_cpus();
-}
-
-static void bts_trace_start(struct trace_array *tr)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu)
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			ds_resume_bts(per_cpu(hwb_tracer, cpu));
-	trace_hw_branches_suspended = 0;
-	put_online_cpus();
-}
-
-static void bts_trace_stop(struct trace_array *tr)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu)
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			ds_suspend_bts(per_cpu(hwb_tracer, cpu));
-	trace_hw_branches_suspended = 1;
-	put_online_cpus();
-}
-
-static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
-				     unsigned long action, void *hcpu)
-{
-	int cpu = (long)hcpu;
-
-	switch (action) {
-	case CPU_ONLINE:
-	case CPU_DOWN_FAILED:
-		/* The notification is sent with interrupts enabled. */
-		if (trace_hw_branches_enabled) {
-			bts_trace_init_cpu(cpu);
-
-			if (trace_hw_branches_suspended &&
-			    likely(per_cpu(hwb_tracer, cpu)))
-				ds_suspend_bts(per_cpu(hwb_tracer, cpu));
-		}
-		break;
-
-	case CPU_DOWN_PREPARE:
-		/* The notification is sent with interrupts enabled. */
-		if (likely(per_cpu(hwb_tracer, cpu))) {
-			ds_release_bts(per_cpu(hwb_tracer, cpu));
-			per_cpu(hwb_tracer, cpu) = NULL;
-		}
-	}
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
-	.notifier_call = bts_hotcpu_handler
-};
-
-static void bts_trace_print_header(struct seq_file *m)
-{
-	seq_puts(m, "# CPU#        TO  <-  FROM\n");
-}
-
-static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
-{
-	unsigned long symflags = TRACE_ITER_SYM_OFFSET;
-	struct trace_entry *entry = iter->ent;
-	struct trace_seq *seq = &iter->seq;
-	struct hw_branch_entry *it;
-
-	trace_assign_type(it, entry);
-
-	if (entry->type == TRACE_HW_BRANCHES) {
-		if (trace_seq_printf(seq, "%4d  ", iter->cpu) &&
-		    seq_print_ip_sym(seq, it->to, symflags) &&
-		    trace_seq_printf(seq, "\t  <-  ") &&
-		    seq_print_ip_sym(seq, it->from, symflags) &&
-		    trace_seq_printf(seq, "\n"))
-			return TRACE_TYPE_HANDLED;
-		return TRACE_TYPE_PARTIAL_LINE;
-	}
-	return TRACE_TYPE_UNHANDLED;
-}
-
-void trace_hw_branch(u64 from, u64 to)
-{
-	struct ftrace_event_call *call = &event_hw_branch;
-	struct trace_array *tr = hw_branch_trace;
-	struct ring_buffer_event *event;
-	struct ring_buffer *buf;
-	struct hw_branch_entry *entry;
-	unsigned long irq1;
-	int cpu;
-
-	if (unlikely(!tr))
-		return;
-
-	if (unlikely(!trace_hw_branches_enabled))
-		return;
-
-	local_irq_save(irq1);
-	cpu = raw_smp_processor_id();
-	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
-		goto out;
-
-	buf = tr->buffer;
-	event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		goto out;
-	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, from);
-	entry->ent.type = TRACE_HW_BRANCHES;
-	entry->from = from;
-	entry->to   = to;
-	if (!filter_check_discard(call, entry, buf, event))
-		trace_buffer_unlock_commit(buf, event, 0, 0);
-
- out:
-	atomic_dec(&tr->data[cpu]->disabled);
-	local_irq_restore(irq1);
-}
-
-static void trace_bts_at(const struct bts_trace *trace, void *at)
-{
-	struct bts_struct bts;
-	int err = 0;
-
-	WARN_ON_ONCE(!trace->read);
-	if (!trace->read)
-		return;
-
-	err = trace->read(this_tracer, at, &bts);
-	if (err < 0)
-		return;
-
-	switch (bts.qualifier) {
-	case BTS_BRANCH:
-		trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
-		break;
-	}
-}
-
-/*
- * Collect the trace on the current cpu and write it into the ftrace buffer.
- *
- * pre: tracing must be suspended on the current cpu
- */
-static void trace_bts_cpu(void *arg)
-{
-	struct trace_array *tr = (struct trace_array *)arg;
-	const struct bts_trace *trace;
-	unsigned char *at;
-
-	if (unlikely(!tr))
-		return;
-
-	if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
-		return;
-
-	if (unlikely(!this_tracer))
-		return;
-
-	trace = ds_read_bts(this_tracer);
-	if (!trace)
-		return;
-
-	for (at = trace->ds.top; (void *)at < trace->ds.end;
-	     at += trace->ds.size)
-		trace_bts_at(trace, at);
-
-	for (at = trace->ds.begin; (void *)at < trace->ds.top;
-	     at += trace->ds.size)
-		trace_bts_at(trace, at);
-}
-
-static void trace_bts_prepare(struct trace_iterator *iter)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu)
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			ds_suspend_bts(per_cpu(hwb_tracer, cpu));
-	/*
-	 * We need to collect the trace on the respective cpu since ftrace
-	 * implicitly adds the record for the current cpu.
-	 * Once that is more flexible, we could collect the data from any cpu.
-	 */
-	on_each_cpu(trace_bts_cpu, iter->tr, 1);
-
-	for_each_online_cpu(cpu)
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			ds_resume_bts(per_cpu(hwb_tracer, cpu));
-	put_online_cpus();
-}
-
-static void trace_bts_close(struct trace_iterator *iter)
-{
-	tracing_reset_online_cpus(iter->tr);
-}
-
-void trace_hw_branch_oops(void)
-{
-	if (this_tracer) {
-		ds_suspend_bts_noirq(this_tracer);
-		trace_bts_cpu(hw_branch_trace);
-		ds_resume_bts_noirq(this_tracer);
-	}
-}
-
-struct tracer bts_tracer __read_mostly =
-{
-	.name		= "hw-branch-tracer",
-	.init		= bts_trace_init,
-	.reset		= bts_trace_reset,
-	.print_header	= bts_trace_print_header,
-	.print_line	= bts_trace_print_line,
-	.start		= bts_trace_start,
-	.stop		= bts_trace_stop,
-	.open		= trace_bts_prepare,
-	.close		= trace_bts_close,
-#ifdef CONFIG_FTRACE_SELFTEST
-	.selftest	= trace_selftest_startup_hw_branches,
-#endif /* CONFIG_FTRACE_SELFTEST */
-};
-
-__init static int init_bts_trace(void)
-{
-	register_hotcpu_notifier(&bts_hotcpu_notifier);
-	return register_tracer(&bts_tracer);
-}
-device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 280fea470d67..a7084e7c0427 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -16,7 +16,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
-	case TRACE_HW_BRANCHES:
 	case TRACE_KSYM:
 		return 1;
 	}
@@ -754,62 +753,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
-#ifdef CONFIG_HW_BRANCH_TRACER
-int
-trace_selftest_startup_hw_branches(struct tracer *trace,
-				   struct trace_array *tr)
-{
-	struct trace_iterator *iter;
-	struct tracer tracer;
-	unsigned long count;
-	int ret;
-
-	if (!trace->open) {
-		printk(KERN_CONT "missing open function...");
-		return -1;
-	}
-
-	ret = tracer_init(trace, tr);
-	if (ret) {
-		warn_failed_init_tracer(trace, ret);
-		return ret;
-	}
-
-	/*
-	 * The hw-branch tracer needs to collect the trace from the various
-	 * cpu trace buffers - before tracing is stopped.
-	 */
-	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
-	if (!iter)
-		return -ENOMEM;
-
-	memcpy(&tracer, trace, sizeof(tracer));
-
-	iter->trace = &tracer;
-	iter->tr = tr;
-	iter->pos = -1;
-	mutex_init(&iter->mutex);
-
-	trace->open(iter);
-
-	mutex_destroy(&iter->mutex);
-	kfree(iter);
-
-	tracing_stop();
-
-	ret = trace_test_buffer(tr, &count);
-	trace->reset(tr);
-	tracing_start();
-
-	if (!ret && !count) {
-		printk(KERN_CONT "no entries found..");
-		ret = -1;
-	}
-
-	return ret;
-}
-#endif /* CONFIG_HW_BRANCH_TRACER */
-
 #ifdef CONFIG_KSYM_TRACER
 static int ksym_selftest_dummy;
 
diff --git a/mm/mlock.c b/mm/mlock.c
index 8f4e2dfceec1..3f82720e0515 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -607,44 +607,3 @@ void user_shm_unlock(size_t size, struct user_struct *user)
 	spin_unlock(&shmlock_user_lock);
 	free_uid(user);
 }
-
-int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
-			  size_t size)
-{
-	unsigned long lim, vm, pgsz;
-	int error = -ENOMEM;
-
-	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
-	down_write(&mm->mmap_sem);
-
-	lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT;
-	vm   = mm->total_vm + pgsz;
-	if (lim < vm)
-		goto out;
-
-	lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT;
-	vm   = mm->locked_vm + pgsz;
-	if (lim < vm)
-		goto out;
-
-	mm->total_vm  += pgsz;
-	mm->locked_vm += pgsz;
-
-	error = 0;
- out:
-	up_write(&mm->mmap_sem);
-	return error;
-}
-
-void refund_locked_memory(struct mm_struct *mm, size_t size)
-{
-	unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
-	down_write(&mm->mmap_sem);
-
-	mm->total_vm  -= pgsz;
-	mm->locked_vm -= pgsz;
-
-	up_write(&mm->mmap_sem);
-}
-- 
cgit v1.2.3-55-g7522


From ea8e61b7bbc4a2faef77db34eb2db2a2c2372ff6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 25 Mar 2010 14:51:51 +0100
Subject: x86, ptrace: Fix block-step

Implement ptrace-block-step using TIF_BLOCKSTEP which will set
DEBUGCTLMSR_BTF when set for a task while preserving any other
DEBUGCTLMSR bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100325135414.017536066@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/processor.h   |  4 ++--
 arch/x86/include/asm/thread_info.h |  4 +++-
 arch/x86/kernel/kprobes.c          | 14 ++++++++++++--
 arch/x86/kernel/process.c          | 11 +++++++++++
 arch/x86/kernel/step.c             | 24 ++++++++++++++++++++----
 arch/x86/kernel/traps.c            |  5 +++++
 6 files changed, 53 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5bec21a66dc5..32428b410b55 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -799,7 +799,7 @@ extern void cpu_init(void);
 
 static inline unsigned long get_debugctlmsr(void)
 {
-    unsigned long debugctlmsr = 0;
+	unsigned long debugctlmsr = 0;
 
 #ifndef CONFIG_X86_DEBUGCTLMSR
 	if (boot_cpu_data.x86 < 6)
@@ -807,7 +807,7 @@ static inline unsigned long get_debugctlmsr(void)
 #endif
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
 
-    return debugctlmsr;
+	return debugctlmsr;
 }
 
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index dc85e12d1405..d017ed5502e2 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -92,6 +92,7 @@ struct thread_info {
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
 #define TIF_FREEZE		23	/* is freezing for suspend */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
+#define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 
@@ -113,6 +114,7 @@ struct thread_info {
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
+#define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 
@@ -143,7 +145,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_NOTSC)
+	(_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7a880ad3a208..f2f56c0967b6 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -422,12 +422,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 
 static void __kprobes clear_btf(void)
 {
-	/* XXX */
+	if (test_thread_flag(TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl &= ~DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+	}
 }
 
 static void __kprobes restore_btf(void)
 {
-	/* XXX */
+	if (test_thread_flag(TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl |= DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+	}
 }
 
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1a60beb32ede..8328009416d7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -195,6 +195,17 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 
+	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
+	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl &= ~DEBUGCTLMSR_BTF;
+		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
+			debugctl |= DEBUGCTLMSR_BTF;
+
+		update_debugctlmsr(debugctl);
+	}
+
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 7beba0769a8c..58de45ee08b6 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -169,9 +169,19 @@ static void enable_step(struct task_struct *child, bool block)
 	 * So noone should try to use debugger block stepping in a program
 	 * that uses user-mode single stepping itself.
 	 */
-	if (!enable_single_step(child))
-		return;
-	/* XXX */
+	if (enable_single_step(child) && block) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl |= DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+		set_tsk_thread_flag(child, TIF_BLOCKSTEP);
+	} else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl &= ~DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+		clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
+	}
 }
 
 void user_enable_single_step(struct task_struct *child)
@@ -189,7 +199,13 @@ void user_disable_single_step(struct task_struct *child)
 	/*
 	 * Make sure block stepping (BTF) is disabled.
 	 */
-	/* XXX */
+	if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl &= ~DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+		clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
+	}
 
 	/* Always clear TIF_SINGLESTEP... */
 	clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index e3da5d726a37..36f1bd9f8e76 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -544,6 +544,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	/* DR6 may or may not be cleared by the CPU */
 	set_debugreg(0, 6);
 
+	/*
+	 * The processor cleared BTF, so don't mark that we need it set.
+	 */
+	clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
+
 	/* Store the virtualized DR6 value */
 	tsk->thread.debugreg6 = dr6;
 
-- 
cgit v1.2.3-55-g7522


From 11164cd4f6dab326a88bdf27f2f8f7c11977e91a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 26 Mar 2010 14:08:44 +0100
Subject: perf, x86: Add Nehelem PMU programming errata workaround

Implement the workaround for Intel Errata AAK100 and AAP53.

Also, remove the Core-i7 name for Nehalem events since there are
also Westmere based i7 chips.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
LKML-Reference: <1269608924.12097.147.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       |  8 +++----
 arch/x86/kernel/cpu/perf_event_intel.c | 43 ++++++++++++++++++++++++++++++----
 arch/x86/kernel/cpu/perf_event_p4.c    |  2 +-
 arch/x86/kernel/cpu/perf_event_p6.c    |  2 +-
 4 files changed, 45 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index f571f514de2a..6f66d4a845ff 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -184,7 +184,7 @@ struct x86_pmu {
 	int		version;
 	int		(*handle_irq)(struct pt_regs *);
 	void		(*disable_all)(void);
-	void		(*enable_all)(void);
+	void		(*enable_all)(int added);
 	void		(*enable)(struct perf_event *);
 	void		(*disable)(struct perf_event *);
 	int		(*hw_config)(struct perf_event_attr *attr, struct hw_perf_event *hwc);
@@ -576,7 +576,7 @@ void hw_perf_disable(void)
 	x86_pmu.disable_all();
 }
 
-static void x86_pmu_enable_all(void)
+static void x86_pmu_enable_all(int added)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
@@ -784,7 +784,7 @@ void hw_perf_enable(void)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
-	int i;
+	int i, added = cpuc->n_added;
 
 	if (!x86_pmu_initialized())
 		return;
@@ -836,7 +836,7 @@ void hw_perf_enable(void)
 	cpuc->enabled = 1;
 	barrier();
 
-	x86_pmu.enable_all();
+	x86_pmu.enable_all(added);
 }
 
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 044b8436b19d..676aac27aca4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -483,7 +483,7 @@ static void intel_pmu_disable_all(void)
 	intel_pmu_lbr_disable_all();
 }
 
-static void intel_pmu_enable_all(void)
+static void intel_pmu_enable_all(int added)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -502,6 +502,40 @@ static void intel_pmu_enable_all(void)
 	}
 }
 
+/*
+ * Workaround for:
+ *   Intel Errata AAK100 (model 26)
+ *   Intel Errata AAP53  (model 30)
+ *
+ * These chips need to be 'reset' when adding counters by programming
+ * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
+ * either in sequence on the same PMC or on different PMCs.
+ */
+static void intel_pmu_nhm_enable_all(int added)
+{
+	if (added) {
+		struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+		int i;
+
+		wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
+		wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
+		wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
+
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+
+		for (i = 0; i < 3; i++) {
+			struct perf_event *event = cpuc->events[i];
+
+			if (!event)
+				continue;
+
+			__x86_pmu_enable_event(&event->hw);
+		}
+	}
+	intel_pmu_enable_all(added);
+}
+
 static inline u64 intel_pmu_get_status(void)
 {
 	u64 status;
@@ -658,7 +692,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	intel_pmu_drain_bts_buffer();
 	status = intel_pmu_get_status();
 	if (!status) {
-		intel_pmu_enable_all();
+		intel_pmu_enable_all(0);
 		return 0;
 	}
 
@@ -707,7 +741,7 @@ again:
 		goto again;
 
 done:
-	intel_pmu_enable_all();
+	intel_pmu_enable_all(0);
 	return 1;
 }
 
@@ -920,7 +954,8 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_nehalem_event_constraints;
-		pr_cont("Nehalem/Corei7 events, ");
+		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+		pr_cont("Nehalem events, ");
 		break;
 
 	case 28: /* Atom */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index f8fe069f14e2..0d1be36cbe9e 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -535,7 +535,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
 				(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
 }
 
-static void p4_pmu_enable_all(void)
+static void p4_pmu_enable_all(int added)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 6ff4d01d880f..877182c850df 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -66,7 +66,7 @@ static void p6_pmu_disable_all(void)
 	wrmsrl(MSR_P6_EVNTSEL0, val);
 }
 
-static void p6_pmu_enable_all(void)
+static void p6_pmu_enable_all(int added)
 {
 	unsigned long val;
 
-- 
cgit v1.2.3-55-g7522


From 948b1bb89a44561560531394c18da4a99215f772 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Mon, 29 Mar 2010 18:36:50 +0200
Subject: perf, x86: Undo some some *_counter* -> *_event* renames

The big rename:

 cdd6c48 perf: Do the big rename: Performance Counters -> Performance Events

accidentally renamed some members of stucts that were named after
registers in the spec. To avoid confusion this patch reverts some
changes. The related specs are MSR descriptions in AMD's BKDGs and the
ARCHITECTURAL PERFORMANCE MONITORING section in the Intel 64 and IA-32
Architectures Software Developer's Manuals.

This patch does:

 $ sed -i -e 's:num_events:num_counters:g' \
   arch/x86/include/asm/perf_event.h \
   arch/x86/kernel/cpu/perf_event_amd.c \
   arch/x86/kernel/cpu/perf_event.c \
   arch/x86/kernel/cpu/perf_event_intel.c \
   arch/x86/kernel/cpu/perf_event_p6.c \
   arch/x86/kernel/cpu/perf_event_p4.c \
   arch/x86/oprofile/op_model_ppro.c

 $ sed -i -e 's:event_bits:cntval_bits:g' -e 's:event_mask:cntval_mask:g' \
   arch/x86/kernel/cpu/perf_event_amd.c \
   arch/x86/kernel/cpu/perf_event.c \
   arch/x86/kernel/cpu/perf_event_intel.c \
   arch/x86/kernel/cpu/perf_event_p6.c \
   arch/x86/kernel/cpu/perf_event_p4.c

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1269880612-25800-2-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h      |  4 +-
 arch/x86/kernel/cpu/perf_event.c       | 74 +++++++++++++++++-----------------
 arch/x86/kernel/cpu/perf_event_amd.c   | 12 +++---
 arch/x86/kernel/cpu/perf_event_intel.c | 16 ++++----
 arch/x86/kernel/cpu/perf_event_p4.c    | 14 +++----
 arch/x86/kernel/cpu/perf_event_p6.c    |  6 +--
 arch/x86/oprofile/op_model_ppro.c      |  4 +-
 7 files changed, 65 insertions(+), 65 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 124dddd598f3..987bf673141e 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -67,7 +67,7 @@
 union cpuid10_eax {
 	struct {
 		unsigned int version_id:8;
-		unsigned int num_events:8;
+		unsigned int num_counters:8;
 		unsigned int bit_width:8;
 		unsigned int mask_length:8;
 	} split;
@@ -76,7 +76,7 @@ union cpuid10_eax {
 
 union cpuid10_edx {
 	struct {
-		unsigned int num_events_fixed:4;
+		unsigned int num_counters_fixed:4;
 		unsigned int reserved:28;
 	} split;
 	unsigned int full;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b53435661813..9daaa1ef504c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -195,10 +195,10 @@ struct x86_pmu {
 	u64		(*event_map)(int);
 	u64		(*raw_event)(u64);
 	int		max_events;
-	int		num_events;
-	int		num_events_fixed;
-	int		event_bits;
-	u64		event_mask;
+	int		num_counters;
+	int		num_counters_fixed;
+	int		cntval_bits;
+	u64		cntval_mask;
 	int		apic;
 	u64		max_period;
 	struct event_constraint *
@@ -268,7 +268,7 @@ static u64
 x86_perf_event_update(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	int shift = 64 - x86_pmu.event_bits;
+	int shift = 64 - x86_pmu.cntval_bits;
 	u64 prev_raw_count, new_raw_count;
 	int idx = hwc->idx;
 	s64 delta;
@@ -320,12 +320,12 @@ static bool reserve_pmc_hardware(void)
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		disable_lapic_nmi_watchdog();
 
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
 			goto perfctr_fail;
 	}
 
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
@@ -336,7 +336,7 @@ eventsel_fail:
 	for (i--; i >= 0; i--)
 		release_evntsel_nmi(x86_pmu.eventsel + i);
 
-	i = x86_pmu.num_events;
+	i = x86_pmu.num_counters;
 
 perfctr_fail:
 	for (i--; i >= 0; i--)
@@ -352,7 +352,7 @@ static void release_pmc_hardware(void)
 {
 	int i;
 
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		release_perfctr_nmi(x86_pmu.perfctr + i);
 		release_evntsel_nmi(x86_pmu.eventsel + i);
 	}
@@ -547,7 +547,7 @@ static void x86_pmu_disable_all(void)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
 		if (!test_bit(idx, cpuc->active_mask))
@@ -582,7 +582,7 @@ static void x86_pmu_enable_all(int added)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		struct perf_event *event = cpuc->events[idx];
 		u64 val;
 
@@ -657,14 +657,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	 * assign events to counters starting with most
 	 * constrained events.
 	 */
-	wmax = x86_pmu.num_events;
+	wmax = x86_pmu.num_counters;
 
 	/*
 	 * when fixed event counters are present,
 	 * wmax is incremented by 1 to account
 	 * for one more choice
 	 */
-	if (x86_pmu.num_events_fixed)
+	if (x86_pmu.num_counters_fixed)
 		wmax++;
 
 	for (w = 1, num = n; num && w <= wmax; w++) {
@@ -714,7 +714,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
 	struct perf_event *event;
 	int n, max_count;
 
-	max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
+	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
 
 	/* current number of events already accepted */
 	n = cpuc->n_events;
@@ -904,7 +904,7 @@ x86_perf_event_set_period(struct perf_event *event)
 	atomic64_set(&hwc->prev_count, (u64)-left);
 
 	wrmsrl(hwc->event_base + idx,
-			(u64)(-left) & x86_pmu.event_mask);
+			(u64)(-left) & x86_pmu.cntval_mask);
 
 	perf_event_update_userpage(event);
 
@@ -987,7 +987,7 @@ void perf_event_print_debug(void)
 	unsigned long flags;
 	int cpu, idx;
 
-	if (!x86_pmu.num_events)
+	if (!x86_pmu.num_counters)
 		return;
 
 	local_irq_save(flags);
@@ -1011,7 +1011,7 @@ void perf_event_print_debug(void)
 	}
 	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
 		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
 
@@ -1024,7 +1024,7 @@ void perf_event_print_debug(void)
 		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
 			cpu, idx, prev_left);
 	}
-	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
 		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1089,7 +1089,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 
@@ -1097,7 +1097,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 		hwc = &event->hw;
 
 		val = x86_perf_event_update(event);
-		if (val & (1ULL << (x86_pmu.event_bits - 1)))
+		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
 			continue;
 
 		/*
@@ -1401,46 +1401,46 @@ void __init init_hw_perf_events(void)
 	if (x86_pmu.quirks)
 		x86_pmu.quirks();
 
-	if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
+	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
 		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
-		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
-		x86_pmu.num_events = X86_PMC_MAX_GENERIC;
+		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
+		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
 	}
-	x86_pmu.intel_ctrl = (1 << x86_pmu.num_events) - 1;
-	perf_max_events = x86_pmu.num_events;
+	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+	perf_max_events = x86_pmu.num_counters;
 
-	if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
+	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
 		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
-		     x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
-		x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
+		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
+		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
 	}
 
 	x86_pmu.intel_ctrl |=
-		((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
+		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
 	perf_events_lapic_init();
 	register_die_notifier(&perf_event_nmi_notifier);
 
 	unconstrained = (struct event_constraint)
-		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
-				   0, x86_pmu.num_events);
+		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
+				   0, x86_pmu.num_counters);
 
 	if (x86_pmu.event_constraints) {
 		for_each_event_constraint(c, x86_pmu.event_constraints) {
 			if (c->cmask != INTEL_ARCH_FIXED_MASK)
 				continue;
 
-			c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
-			c->weight += x86_pmu.num_events;
+			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+			c->weight += x86_pmu.num_counters;
 		}
 	}
 
 	pr_info("... version:                %d\n",     x86_pmu.version);
-	pr_info("... bit width:              %d\n",     x86_pmu.event_bits);
-	pr_info("... generic registers:      %d\n",     x86_pmu.num_events);
-	pr_info("... value mask:             %016Lx\n", x86_pmu.event_mask);
+	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
+	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
+	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
 	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
-	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);
+	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
 	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
 
 	perf_cpu_notifier(x86_pmu_notifier);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 285623bc3cc8..7753a5c76535 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -165,7 +165,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
 	 * be removed on one CPU at a time AND PMU is disabled
 	 * when we come here
 	 */
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (nb->owners[i] == event) {
 			cmpxchg(nb->owners+i, event, NULL);
 			break;
@@ -215,7 +215,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	struct amd_nb *nb = cpuc->amd_nb;
 	struct perf_event *old = NULL;
-	int max = x86_pmu.num_events;
+	int max = x86_pmu.num_counters;
 	int i, j, k = -1;
 
 	/*
@@ -293,7 +293,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
 	/*
 	 * initialize all possible NB constraints
 	 */
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		__set_bit(i, nb->event_constraints[i].idxmsk);
 		nb->event_constraints[i].weight = 1;
 	}
@@ -385,9 +385,9 @@ static __initconst struct x86_pmu amd_pmu = {
 	.event_map		= amd_pmu_event_map,
 	.raw_event		= amd_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
-	.num_events		= 4,
-	.event_bits		= 48,
-	.event_mask		= (1ULL << 48) - 1,
+	.num_counters		= 4,
+	.cntval_bits		= 48,
+	.cntval_mask		= (1ULL << 48) - 1,
 	.apic			= 1,
 	/* use highest bit to detect overflow */
 	.max_period		= (1ULL << 47) - 1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 676aac27aca4..cc4d90a13d53 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -653,20 +653,20 @@ static void intel_pmu_reset(void)
 	unsigned long flags;
 	int idx;
 
-	if (!x86_pmu.num_events)
+	if (!x86_pmu.num_counters)
 		return;
 
 	local_irq_save(flags);
 
 	printk("clearing PMU state on CPU#%d\n", smp_processor_id());
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
 		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
 	}
-	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
 		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
-	}
+
 	if (ds)
 		ds->bts_index = ds->bts_buffer_base;
 
@@ -901,16 +901,16 @@ static __init int intel_pmu_init(void)
 		x86_pmu = intel_pmu;
 
 	x86_pmu.version			= version;
-	x86_pmu.num_events		= eax.split.num_events;
-	x86_pmu.event_bits		= eax.split.bit_width;
-	x86_pmu.event_mask		= (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.cntval_bits		= eax.split.bit_width;
+	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1;
 
 	/*
 	 * Quirk: v2 perfmon does not report fixed-purpose events, so
 	 * assume at least 3 events:
 	 */
 	if (version > 1)
-		x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
+		x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
 
 	/*
 	 * v2 and above have a perf capabilities MSR
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 0d1be36cbe9e..4139100404e8 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -483,7 +483,7 @@ static void p4_pmu_disable_all(void)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		struct perf_event *event = cpuc->events[idx];
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
@@ -540,7 +540,7 @@ static void p4_pmu_enable_all(int added)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		struct perf_event *event = cpuc->events[idx];
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
@@ -562,7 +562,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
@@ -579,7 +579,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 		p4_pmu_clear_cccr_ovf(hwc);
 
 		val = x86_perf_event_update(event);
-		if (val & (1ULL << (x86_pmu.event_bits - 1)))
+		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
 			continue;
 
 		/*
@@ -794,10 +794,10 @@ static __initconst struct x86_pmu p4_pmu = {
 	 * though leave it restricted at moment assuming
 	 * HT is on
 	 */
-	.num_events		= ARCH_P4_MAX_CCCR,
+	.num_counters		= ARCH_P4_MAX_CCCR,
 	.apic			= 1,
-	.event_bits		= 40,
-	.event_mask		= (1ULL << 40) - 1,
+	.cntval_bits		= 40,
+	.cntval_mask		= (1ULL << 40) - 1,
 	.max_period		= (1ULL << 39) - 1,
 	.hw_config		= p4_hw_config,
 	.schedule_events	= p4_pmu_schedule_events,
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 877182c850df..b26fbc7eb93c 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -119,7 +119,7 @@ static __initconst struct x86_pmu p6_pmu = {
 	.apic			= 1,
 	.max_period		= (1ULL << 31) - 1,
 	.version		= 0,
-	.num_events		= 2,
+	.num_counters		= 2,
 	/*
 	 * Events have 40 bits implemented. However they are designed such
 	 * that bits [32-39] are sign extensions of bit 31. As such the
@@ -127,8 +127,8 @@ static __initconst struct x86_pmu p6_pmu = {
 	 *
 	 * See IA-32 Intel Architecture Software developer manual Vol 3B
 	 */
-	.event_bits		= 32,
-	.event_mask		= (1ULL << 32) - 1,
+	.cntval_bits		= 32,
+	.cntval_mask		= (1ULL << 32) - 1,
 	.get_event_constraints	= x86_get_event_constraints,
 	.event_constraints	= p6_event_constraints,
 };
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 2bf90fafa7b5..c8abc4d1bf35 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -239,11 +239,11 @@ static void arch_perfmon_setup_counters(void)
 	if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
 		current_cpu_data.x86_model == 15) {
 		eax.split.version_id = 2;
-		eax.split.num_events = 2;
+		eax.split.num_counters = 2;
 		eax.split.bit_width = 40;
 	}
 
-	num_counters = eax.split.num_events;
+	num_counters = eax.split.num_counters;
 
 	op_arch_perfmon_spec.num_counters = num_counters;
 	op_arch_perfmon_spec.num_controls = num_counters;
-- 
cgit v1.2.3-55-g7522


From a098f4484bc7dae23f5b62360954007b99b64600 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Tue, 30 Mar 2010 11:28:21 +0200
Subject: perf, x86: implement ARCH_PERFMON_EVENTSEL bit masks

ARCH_PERFMON_EVENTSEL bit masks are often used in the kernel. This
patch adds macros for the bit masks and removes local defines. The
function intel_pmu_raw_event() becomes x86_pmu_raw_event() which is
generic for x86 models and same also for p6. Duplicate code is
removed.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100330092821.GH11907@erda.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h      | 58 +++++++++++++++-------------------
 arch/x86/kernel/cpu/perf_event.c       | 19 +++++++++--
 arch/x86/kernel/cpu/perf_event_amd.c   | 15 +--------
 arch/x86/kernel/cpu/perf_event_intel.c | 22 ++-----------
 arch/x86/kernel/cpu/perf_event_p6.c    | 20 +-----------
 5 files changed, 45 insertions(+), 89 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 987bf673141e..f6d43dbfd8e7 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -18,39 +18,31 @@
 #define MSR_ARCH_PERFMON_EVENTSEL0			     0x186
 #define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
 
-#define ARCH_PERFMON_EVENTSEL_ENABLE			  (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_ANY			  (1 << 21)
-#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
-
-/*
- * Includes eventsel and unit mask as well:
- */
-
-
-#define INTEL_ARCH_EVTSEL_MASK		0x000000FFULL
-#define INTEL_ARCH_UNIT_MASK		0x0000FF00ULL
-#define INTEL_ARCH_EDGE_MASK		0x00040000ULL
-#define INTEL_ARCH_INV_MASK		0x00800000ULL
-#define INTEL_ARCH_CNT_MASK		0xFF000000ULL
-#define INTEL_ARCH_EVENT_MASK	(INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK)
-
-/*
- * filter mask to validate fixed counter events.
- * the following filters disqualify for fixed counters:
- *  - inv
- *  - edge
- *  - cnt-mask
- *  The other filters are supported by fixed counters.
- *  The any-thread option is supported starting with v3.
- */
-#define INTEL_ARCH_FIXED_MASK \
-	(INTEL_ARCH_CNT_MASK| \
-	 INTEL_ARCH_INV_MASK| \
-	 INTEL_ARCH_EDGE_MASK|\
-	 INTEL_ARCH_UNIT_MASK|\
-	 INTEL_ARCH_EVENT_MASK)
+#define ARCH_PERFMON_EVENTSEL_EVENT			0x000000FFULL
+#define ARCH_PERFMON_EVENTSEL_UMASK			0x0000FF00ULL
+#define ARCH_PERFMON_EVENTSEL_USR			(1ULL << 16)
+#define ARCH_PERFMON_EVENTSEL_OS			(1ULL << 17)
+#define ARCH_PERFMON_EVENTSEL_EDGE			(1ULL << 18)
+#define ARCH_PERFMON_EVENTSEL_INT			(1ULL << 20)
+#define ARCH_PERFMON_EVENTSEL_ANY			(1ULL << 21)
+#define ARCH_PERFMON_EVENTSEL_ENABLE			(1ULL << 22)
+#define ARCH_PERFMON_EVENTSEL_INV			(1ULL << 23)
+#define ARCH_PERFMON_EVENTSEL_CMASK			0xFF000000ULL
+
+#define AMD64_EVENTSEL_EVENT	\
+	(ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
+#define INTEL_ARCH_EVENT_MASK	\
+	(ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT)
+
+#define X86_RAW_EVENT_MASK		\
+	(ARCH_PERFMON_EVENTSEL_EVENT |	\
+	 ARCH_PERFMON_EVENTSEL_UMASK |	\
+	 ARCH_PERFMON_EVENTSEL_EDGE  |	\
+	 ARCH_PERFMON_EVENTSEL_INV   |	\
+	 ARCH_PERFMON_EVENTSEL_CMASK)
+#define AMD64_RAW_EVENT_MASK		\
+	(X86_RAW_EVENT_MASK          |  \
+	 AMD64_EVENTSEL_EVENT)
 
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9daaa1ef504c..1dd42c18f1cb 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -143,13 +143,21 @@ struct cpu_hw_events {
  * Constraint on the Event code.
  */
 #define INTEL_EVENT_CONSTRAINT(c, n)	\
-	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
+	EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
 
 /*
  * Constraint on the Event code + UMask + fixed-mask
+ *
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ *  - inv
+ *  - edge
+ *  - cnt-mask
+ *  The other filters are supported by fixed counters.
+ *  The any-thread option is supported starting with v3.
  */
 #define FIXED_EVENT_CONSTRAINT(c, n)	\
-	EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
+	EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
 
 /*
  * Constraint on the Event code + UMask
@@ -437,6 +445,11 @@ static int x86_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc
 	return 0;
 }
 
+static u64 x86_pmu_raw_event(u64 hw_event)
+{
+	return hw_event & X86_RAW_EVENT_MASK;
+}
+
 /*
  * Setup the hardware configuration for a given attr_type
  */
@@ -1427,7 +1440,7 @@ void __init init_hw_perf_events(void)
 
 	if (x86_pmu.event_constraints) {
 		for_each_event_constraint(c, x86_pmu.event_constraints) {
-			if (c->cmask != INTEL_ARCH_FIXED_MASK)
+			if (c->cmask != X86_RAW_EVENT_MASK)
 				continue;
 
 			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 7753a5c76535..37e9517729df 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -113,20 +113,7 @@ static u64 amd_pmu_event_map(int hw_event)
 
 static u64 amd_pmu_raw_event(u64 hw_event)
 {
-#define K7_EVNTSEL_EVENT_MASK	0xF000000FFULL
-#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
-#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
-#define K7_EVNTSEL_INV_MASK	0x000800000ULL
-#define K7_EVNTSEL_REG_MASK	0x0FF000000ULL
-
-#define K7_EVNTSEL_MASK			\
-	(K7_EVNTSEL_EVENT_MASK |	\
-	 K7_EVNTSEL_UNIT_MASK  |	\
-	 K7_EVNTSEL_EDGE_MASK  |	\
-	 K7_EVNTSEL_INV_MASK   |	\
-	 K7_EVNTSEL_REG_MASK)
-
-	return hw_event & K7_EVNTSEL_MASK;
+	return hw_event & AMD64_RAW_EVENT_MASK;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index cc4d90a13d53..dfdd6f90fc8e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -452,24 +452,6 @@ static __initconst u64 atom_hw_cache_event_ids
  },
 };
 
-static u64 intel_pmu_raw_event(u64 hw_event)
-{
-#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
-#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
-#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
-#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
-#define CORE_EVNTSEL_REG_MASK		0xFF000000ULL
-
-#define CORE_EVNTSEL_MASK		\
-	(INTEL_ARCH_EVTSEL_MASK |	\
-	 INTEL_ARCH_UNIT_MASK   |	\
-	 INTEL_ARCH_EDGE_MASK   |	\
-	 INTEL_ARCH_INV_MASK    |	\
-	 INTEL_ARCH_CNT_MASK)
-
-	return hw_event & CORE_EVNTSEL_MASK;
-}
-
 static void intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -788,7 +770,7 @@ static __initconst struct x86_pmu core_pmu = {
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
-	.raw_event		= intel_pmu_raw_event,
+	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	/*
@@ -827,7 +809,7 @@ static __initconst struct x86_pmu intel_pmu = {
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
-	.raw_event		= intel_pmu_raw_event,
+	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index b26fbc7eb93c..03c139a67baa 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event)
  */
 #define P6_NOP_EVENT			0x0000002EULL
 
-static u64 p6_pmu_raw_event(u64 hw_event)
-{
-#define P6_EVNTSEL_EVENT_MASK		0x000000FFULL
-#define P6_EVNTSEL_UNIT_MASK		0x0000FF00ULL
-#define P6_EVNTSEL_EDGE_MASK		0x00040000ULL
-#define P6_EVNTSEL_INV_MASK		0x00800000ULL
-#define P6_EVNTSEL_REG_MASK		0xFF000000ULL
-
-#define P6_EVNTSEL_MASK			\
-	(P6_EVNTSEL_EVENT_MASK |	\
-	 P6_EVNTSEL_UNIT_MASK  |	\
-	 P6_EVNTSEL_EDGE_MASK  |	\
-	 P6_EVNTSEL_INV_MASK   |	\
-	 P6_EVNTSEL_REG_MASK)
-
-	return hw_event & P6_EVNTSEL_MASK;
-}
-
 static struct event_constraint p6_event_constraints[] =
 {
 	INTEL_EVENT_CONSTRAINT(0xc1, 0x1),	/* FLOPS */
@@ -114,7 +96,7 @@ static __initconst struct x86_pmu p6_pmu = {
 	.eventsel		= MSR_P6_EVNTSEL0,
 	.perfctr		= MSR_P6_PERFCTR0,
 	.event_map		= p6_pmu_event_map,
-	.raw_event		= p6_pmu_raw_event,
+	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(p6_perfmon_event_map),
 	.apic			= 1,
 	.max_period		= (1ULL << 31) - 1,
-- 
cgit v1.2.3-55-g7522


From b4cdc5c264b35c67007800dec3928e9547a9d70b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 30 Mar 2010 17:00:06 +0200
Subject: perf, x86: Fix up the ANY flag stuff

Stephane noticed that the ANY flag was in generic arch code, and Cyrill
reported that it broke the P4 code.

Solve this by merging x86_pmu::raw_event into x86_pmu::hw_config and
provide intel_pmu and amd_pmu specific versions of this callback.

The intel_pmu one deals with the ANY flag, the amd_pmu adds the few extra
event bits AMD64 has.

Reported-by: Stephane Eranian <eranian@google.com>
Reported-by: Cyrill Gorcunov <gorcunov@gmail.com>
Acked-by: Robert Richter <robert.richter@amd.com>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Acked-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1269968113.5258.442.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       | 35 ++++++++++-------------------
 arch/x86/kernel/cpu/perf_event_amd.c   | 17 +++++++++++----
 arch/x86/kernel/cpu/perf_event_intel.c | 30 +++++++++++++++++++++----
 arch/x86/kernel/cpu/perf_event_p4.c    | 40 +++++++++++++++++++---------------
 arch/x86/kernel/cpu/perf_event_p6.c    |  3 +--
 5 files changed, 74 insertions(+), 51 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1dd42c18f1cb..65e9c5efb618 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -196,12 +196,11 @@ struct x86_pmu {
 	void		(*enable_all)(int added);
 	void		(*enable)(struct perf_event *);
 	void		(*disable)(struct perf_event *);
-	int		(*hw_config)(struct perf_event_attr *attr, struct hw_perf_event *hwc);
+	int		(*hw_config)(struct perf_event *event);
 	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
 	unsigned	eventsel;
 	unsigned	perfctr;
 	u64		(*event_map)(int);
-	u64		(*raw_event)(u64);
 	int		max_events;
 	int		num_counters;
 	int		num_counters_fixed;
@@ -426,28 +425,26 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 	return 0;
 }
 
-static int x86_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
+static int x86_pmu_hw_config(struct perf_event *event)
 {
 	/*
 	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
 	 */
-	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
 
 	/*
 	 * Count user and OS events unless requested not to
 	 */
-	if (!attr->exclude_user)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
-	if (!attr->exclude_kernel)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+	if (!event->attr.exclude_user)
+		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
+	if (!event->attr.exclude_kernel)
+		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
 
-	return 0;
-}
+	if (event->attr.type == PERF_TYPE_RAW)
+		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
-static u64 x86_pmu_raw_event(u64 hw_event)
-{
-	return hw_event & X86_RAW_EVENT_MASK;
+	return 0;
 }
 
 /*
@@ -489,7 +486,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	hwc->last_tag = ~0ULL;
 
 	/* Processor specifics */
-	err = x86_pmu.hw_config(attr, hwc);
+	err = x86_pmu.hw_config(event);
 	if (err)
 		return err;
 
@@ -508,16 +505,8 @@ static int __hw_perf_event_init(struct perf_event *event)
 			return -EOPNOTSUPP;
 	}
 
-	/*
-	 * Raw hw_event type provide the config in the hw_event structure
-	 */
-	if (attr->type == PERF_TYPE_RAW) {
-		hwc->config |= x86_pmu.raw_event(attr->config);
-		if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
-		    perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
+	if (attr->type == PERF_TYPE_RAW)
 		return 0;
-	}
 
 	if (attr->type == PERF_TYPE_HW_CACHE)
 		return set_ext_hw_attr(hwc, attr);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 37e9517729df..bbd7339f08a9 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -111,9 +111,19 @@ static u64 amd_pmu_event_map(int hw_event)
 	return amd_perfmon_event_map[hw_event];
 }
 
-static u64 amd_pmu_raw_event(u64 hw_event)
+static int amd_pmu_hw_config(struct perf_event *event)
 {
-	return hw_event & AMD64_RAW_EVENT_MASK;
+	int ret = x86_pmu_hw_config(event);
+
+	if (ret)
+		return ret;
+
+	if (event->attr.type != PERF_TYPE_RAW)
+		return 0;
+
+	event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+
+	return 0;
 }
 
 /*
@@ -365,12 +375,11 @@ static __initconst struct x86_pmu amd_pmu = {
 	.enable_all		= x86_pmu_enable_all,
 	.enable			= x86_pmu_enable_event,
 	.disable		= x86_pmu_disable_event,
-	.hw_config		= x86_hw_config,
+	.hw_config		= amd_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_K7_EVNTSEL0,
 	.perfctr		= MSR_K7_PERFCTR0,
 	.event_map		= amd_pmu_event_map,
-	.raw_event		= amd_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
 	.num_counters		= 4,
 	.cntval_bits		= 48,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index dfdd6f90fc8e..30bf10c55f1e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -758,6 +758,30 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 	return x86_get_event_constraints(cpuc, event);
 }
 
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+	int ret = x86_pmu_hw_config(event);
+
+	if (ret)
+		return ret;
+
+	if (event->attr.type != PERF_TYPE_RAW)
+		return 0;
+
+	if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
+		return 0;
+
+	if (x86_pmu.version < 3)
+		return -EINVAL;
+
+	if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
+
+	return 0;
+}
+
 static __initconst struct x86_pmu core_pmu = {
 	.name			= "core",
 	.handle_irq		= x86_pmu_handle_irq,
@@ -765,12 +789,11 @@ static __initconst struct x86_pmu core_pmu = {
 	.enable_all		= x86_pmu_enable_all,
 	.enable			= x86_pmu_enable_event,
 	.disable		= x86_pmu_disable_event,
-	.hw_config		= x86_hw_config,
+	.hw_config		= x86_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
-	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	/*
@@ -804,12 +827,11 @@ static __initconst struct x86_pmu intel_pmu = {
 	.enable_all		= intel_pmu_enable_all,
 	.enable			= intel_pmu_enable_event,
 	.disable		= intel_pmu_disable_event,
-	.hw_config		= x86_hw_config,
+	.hw_config		= intel_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
-	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 4139100404e8..acd237d29f11 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -419,20 +419,7 @@ static u64 p4_pmu_event_map(int hw_event)
 	return config;
 }
 
-/*
- * We don't control raw events so it's up to the caller
- * to pass sane values (and we don't count the thread number
- * on HT machine but allow HT-compatible specifics to be
- * passed on)
- */
-static u64 p4_pmu_raw_event(u64 hw_event)
-{
-	return hw_event &
-		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
-		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
-}
-
-static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
+static int p4_hw_config(struct perf_event *event)
 {
 	int cpu = raw_smp_processor_id();
 	u32 escr, cccr;
@@ -444,11 +431,29 @@ static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
 	 */
 
 	cccr = p4_default_cccr_conf(cpu);
-	escr = p4_default_escr_conf(cpu, attr->exclude_kernel, attr->exclude_user);
-	hwc->config = p4_config_pack_escr(escr) | p4_config_pack_cccr(cccr);
+	escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
+					 event->attr.exclude_user);
+	event->hw.config = p4_config_pack_escr(escr) |
+			   p4_config_pack_cccr(cccr);
 
 	if (p4_ht_active() && p4_ht_thread(cpu))
-		hwc->config = p4_set_ht_bit(hwc->config);
+		event->hw.config = p4_set_ht_bit(event->hw.config);
+
+	if (event->attr.type != PERF_TYPE_RAW)
+		return 0;
+
+	/*
+	 * We don't control raw events so it's up to the caller
+	 * to pass sane values (and we don't count the thread number
+	 * on HT machine but allow HT-compatible specifics to be
+	 * passed on)
+	 *
+	 * XXX: HT wide things should check perf_paranoid_cpu() &&
+	 *      CAP_SYS_ADMIN
+	 */
+	event->hw.config |= event->attr.config &
+		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
+		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
 
 	return 0;
 }
@@ -785,7 +790,6 @@ static __initconst struct x86_pmu p4_pmu = {
 	.eventsel		= MSR_P4_BPU_CCCR0,
 	.perfctr		= MSR_P4_BPU_PERFCTR0,
 	.event_map		= p4_pmu_event_map,
-	.raw_event		= p4_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(p4_general_events),
 	.get_event_constraints	= x86_get_event_constraints,
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 03c139a67baa..9123e8ec9958 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -91,12 +91,11 @@ static __initconst struct x86_pmu p6_pmu = {
 	.enable_all		= p6_pmu_enable_all,
 	.enable			= p6_pmu_enable_event,
 	.disable		= p6_pmu_disable_event,
-	.hw_config		= x86_hw_config,
+	.hw_config		= x86_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_P6_EVNTSEL0,
 	.perfctr		= MSR_P6_PERFCTR0,
 	.event_map		= p6_pmu_event_map,
-	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(p6_perfmon_event_map),
 	.apic			= 1,
 	.max_period		= (1ULL << 31) - 1,
-- 
cgit v1.2.3-55-g7522


From caaa8be3b6707cb9664e573a28b00f845ce9f32e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 29 Mar 2010 13:09:53 +0200
Subject: perf, x86: Fix __initconst vs const

All variables that have __initconst should also be const.

Suggested-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_amd.c   |  4 ++--
 arch/x86/kernel/cpu/perf_event_intel.c | 12 ++++++------
 arch/x86/kernel/cpu/perf_event_p4.c    |  4 ++--
 arch/x86/kernel/cpu/perf_event_p6.c    |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index bbd7339f08a9..611df11ba15e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,7 +2,7 @@
 
 static DEFINE_RAW_SPINLOCK(amd_nb_lock);
 
-static __initconst u64 amd_hw_cache_event_ids
+static __initconst const u64 amd_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -368,7 +368,7 @@ static void amd_pmu_cpu_dead(int cpu)
 	raw_spin_unlock(&amd_nb_lock);
 }
 
-static __initconst struct x86_pmu amd_pmu = {
+static __initconst const struct x86_pmu amd_pmu = {
 	.name			= "AMD",
 	.handle_irq		= x86_pmu_handle_irq,
 	.disable_all		= x86_pmu_disable_all,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 30bf10c55f1e..1957e3f14c04 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -88,7 +88,7 @@ static u64 intel_pmu_event_map(int hw_event)
 	return intel_perfmon_event_map[hw_event];
 }
 
-static __initconst u64 westmere_hw_cache_event_ids
+static __initconst const u64 westmere_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -179,7 +179,7 @@ static __initconst u64 westmere_hw_cache_event_ids
  },
 };
 
-static __initconst u64 nehalem_hw_cache_event_ids
+static __initconst const u64 nehalem_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -270,7 +270,7 @@ static __initconst u64 nehalem_hw_cache_event_ids
  },
 };
 
-static __initconst u64 core2_hw_cache_event_ids
+static __initconst const u64 core2_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -361,7 +361,7 @@ static __initconst u64 core2_hw_cache_event_ids
  },
 };
 
-static __initconst u64 atom_hw_cache_event_ids
+static __initconst const u64 atom_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -782,7 +782,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	return 0;
 }
 
-static __initconst struct x86_pmu core_pmu = {
+static __initconst const struct x86_pmu core_pmu = {
 	.name			= "core",
 	.handle_irq		= x86_pmu_handle_irq,
 	.disable_all		= x86_pmu_disable_all,
@@ -820,7 +820,7 @@ static void intel_pmu_cpu_dying(int cpu)
 	fini_debug_store_on_cpu(cpu);
 }
 
-static __initconst struct x86_pmu intel_pmu = {
+static __initconst const struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
 	.disable_all		= intel_pmu_disable_all,
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index acd237d29f11..15367cce66bd 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -287,7 +287,7 @@ static struct p4_event_bind p4_event_bind_map[] = {
 	p4_config_pack_cccr(cache_event					| \
 			    P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
 
-static __initconst u64 p4_hw_cache_event_ids
+static __initconst const u64 p4_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -780,7 +780,7 @@ done:
 	return num ? -ENOSPC : 0;
 }
 
-static __initconst struct x86_pmu p4_pmu = {
+static __initconst const struct x86_pmu p4_pmu = {
 	.name			= "Netburst P4/Xeon",
 	.handle_irq		= p4_pmu_handle_irq,
 	.disable_all		= p4_pmu_disable_all,
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 9123e8ec9958..34ba07be2cda 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -84,7 +84,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
 	(void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
 }
 
-static __initconst struct x86_pmu p6_pmu = {
+static __initconst const struct x86_pmu p6_pmu = {
 	.name			= "p6",
 	.handle_irq		= x86_pmu_handle_irq,
 	.disable_all		= p6_pmu_disable_all,
-- 
cgit v1.2.3-55-g7522


From 40b91cd10f000b4c4934e48e2e5c0bec66def144 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 29 Mar 2010 16:37:17 +0200
Subject: perf, x86: Add Nehalem programming quirk to Westmere

According to the Xeon-5600 errata the Westmere suffers the same PMU
programming bug as the original Nehalem did.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 1957e3f14c04..f168b4030d40 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -488,6 +488,7 @@ static void intel_pmu_enable_all(int added)
  * Workaround for:
  *   Intel Errata AAK100 (model 26)
  *   Intel Errata AAP53  (model 30)
+ *   Intel Errata BD53   (model 44)
  *
  * These chips need to be 'reset' when adding counters by programming
  * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
@@ -980,6 +981,7 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_westmere_event_constraints;
+		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		pr_cont("Westmere events, ");
 		break;
 
-- 
cgit v1.2.3-55-g7522


From 57283776b2b821ba4d592f61cad04d0293412740 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas
Date: Thu, 11 Mar 2010 12:20:11 -0700
Subject: ACPI: pci_root: pass acpi_pci_root to arch-specific scan

The acpi_pci_root structure contains all the individual items (acpi_device,
domain, bus number) we pass to pci_acpi_scan_root(), so just pass the
single acpi_pci_root pointer directly.

This will make it easier to add _CBA support later.  For _CBA, we need the
entire downstream bus range, not just the base bus number.  We have that in
the acpi_pci_root structure, so passing the pointer makes it available to
the arch-specific code.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Reviewed-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/ia64/pci/pci.c         | 5 ++++-
 arch/x86/pci/acpi.c         | 5 ++++-
 drivers/acpi/pci_root.c     | 2 +-
 include/acpi/acpi_drivers.h | 3 +--
 4 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index 64aff520b899..aa2533ae7e9e 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -335,8 +335,11 @@ pcibios_setup_root_windows(struct pci_bus *bus, struct pci_controller *ctrl)
 }
 
 struct pci_bus * __devinit
-pci_acpi_scan_root(struct acpi_device *device, int domain, int bus)
+pci_acpi_scan_root(struct acpi_pci_root *root)
 {
+	struct acpi_device *device = root->device;
+	int domain = root->segment;
+	int bus = root->secondary.start;
 	struct pci_controller *controller;
 	unsigned int windows = 0;
 	struct pci_bus *pbus;
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index e31160216efb..0b7882dbe784 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -229,8 +229,11 @@ res_alloc_fail:
 	return;
 }
 
-struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum)
+struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 {
+	struct acpi_device *device = root->device;
+	int domain = root->segment;
+	int busnum = root->secondary.start;
 	struct pci_bus *bus;
 	struct pci_sysdata *sd;
 	int node;
diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index bf476fea97ab..680450c905b3 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -498,7 +498,7 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device)
 	 * PCI namespace does not get created until this call is made (and 
 	 * thus the root bridge's pci_dev does not exist).
 	 */
-	root->bus = pci_acpi_scan_root(device, segment, root->secondary.start);
+	root->bus = pci_acpi_scan_root(root);
 	if (!root->bus) {
 		printk(KERN_ERR PREFIX
 			    "Bus %04x:%02x not present in PCI namespace\n",
diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
index 4f7b44866b76..23d78b4d088b 100644
--- a/include/acpi/acpi_drivers.h
+++ b/include/acpi/acpi_drivers.h
@@ -104,8 +104,7 @@ int acpi_pci_bind_root(struct acpi_device *device);
 
 /* Arch-defined function to add a bus to the system */
 
-struct pci_bus *pci_acpi_scan_root(struct acpi_device *device, int domain,
-				   int bus);
+struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root);
 void pci_acpi_crs_quirks(void);
 
 /* --------------------------------------------------------------------------
-- 
cgit v1.2.3-55-g7522


From 6f4dee06fbf0133917f3d76fa3fb50e18b10c1f5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 18 Mar 2010 23:47:01 +0100
Subject: perf: Drop the frame reliablity check

It is useless now that we have a pure stack frame
walker, as given addr are always reliable.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 65e9c5efb618..353a174adb44 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1602,8 +1602,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
 {
 	struct perf_callchain_entry *entry = data;
 
-	if (reliable)
-		callchain_store(entry, addr);
+	callchain_store(entry, addr);
 }
 
 static const struct stacktrace_ops backtrace_ops = {
-- 
cgit v1.2.3-55-g7522


From 4f64625412be120cef9e9b97e88c406ec2c78027 Mon Sep 17 00:00:00 2001
From: Pauli Nieminen
Date: Thu, 1 Apr 2010 12:45:01 +0000
Subject: arch/x86: Add array variants for setting memory to wc caching.

Setting single memory pages at a time to wc takes a lot time in cache flush. To
reduce number of cache flush set_pages_array_wc and set_memory_array_wc can be
used to set multiple pages to WC with single cache flush.

This improves allocation performance for wc cached pages in drm/ttm.

CC: Suresh Siddha <suresh.b.siddha@intel.com>
CC: Venkatesh Pallipadi <venkatesh.pallipadi@gmail.com>
Signed-off-by: Pauli Nieminen <suokkos@gmail.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 arch/x86/include/asm/cacheflush.h |  2 ++
 arch/x86/mm/pageattr.c            | 53 +++++++++++++++++++++++++++++++++------
 2 files changed, 47 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 634c40a739a6..d92d63a6286b 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -139,9 +139,11 @@ int set_memory_np(unsigned long addr, int numpages);
 int set_memory_4k(unsigned long addr, int numpages);
 
 int set_memory_array_uc(unsigned long *addr, int addrinarray);
+int set_memory_array_wc(unsigned long *addr, int addrinarray);
 int set_memory_array_wb(unsigned long *addr, int addrinarray);
 
 int set_pages_array_uc(struct page **pages, int addrinarray);
+int set_pages_array_wc(struct page **pages, int addrinarray);
 int set_pages_array_wb(struct page **pages, int addrinarray);
 
 /*
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index cf07c26d9a4a..0c98a7583bfa 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -997,7 +997,8 @@ out_err:
 }
 EXPORT_SYMBOL(set_memory_uc);
 
-int set_memory_array_uc(unsigned long *addr, int addrinarray)
+int _set_memory_array(unsigned long *addr, int addrinarray,
+		unsigned long new_type)
 {
 	int i, j;
 	int ret;
@@ -1007,13 +1008,19 @@ int set_memory_array_uc(unsigned long *addr, int addrinarray)
 	 */
 	for (i = 0; i < addrinarray; i++) {
 		ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
-					_PAGE_CACHE_UC_MINUS, NULL);
+					new_type, NULL);
 		if (ret)
 			goto out_free;
 	}
 
 	ret = change_page_attr_set(addr, addrinarray,
 				    __pgprot(_PAGE_CACHE_UC_MINUS), 1);
+
+	if (!ret && new_type == _PAGE_CACHE_WC)
+		ret = change_page_attr_set_clr(addr, addrinarray,
+					       __pgprot(_PAGE_CACHE_WC),
+					       __pgprot(_PAGE_CACHE_MASK),
+					       0, CPA_ARRAY, NULL);
 	if (ret)
 		goto out_free;
 
@@ -1025,8 +1032,19 @@ out_free:
 
 	return ret;
 }
+
+int set_memory_array_uc(unsigned long *addr, int addrinarray)
+{
+	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_UC_MINUS);
+}
 EXPORT_SYMBOL(set_memory_array_uc);
 
+int set_memory_array_wc(unsigned long *addr, int addrinarray)
+{
+	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_WC);
+}
+EXPORT_SYMBOL(set_memory_array_wc);
+
 int _set_memory_wc(unsigned long addr, int numpages)
 {
 	int ret;
@@ -1153,26 +1171,34 @@ int set_pages_uc(struct page *page, int numpages)
 }
 EXPORT_SYMBOL(set_pages_uc);
 
-int set_pages_array_uc(struct page **pages, int addrinarray)
+static int _set_pages_array(struct page **pages, int addrinarray,
+		unsigned long new_type)
 {
 	unsigned long start;
 	unsigned long end;
 	int i;
 	int free_idx;
+	int ret;
 
 	for (i = 0; i < addrinarray; i++) {
 		if (PageHighMem(pages[i]))
 			continue;
 		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
 		end = start + PAGE_SIZE;
-		if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
+		if (reserve_memtype(start, end, new_type, NULL))
 			goto err_out;
 	}
 
-	if (cpa_set_pages_array(pages, addrinarray,
-			__pgprot(_PAGE_CACHE_UC_MINUS)) == 0) {
-		return 0; /* Success */
-	}
+	ret = cpa_set_pages_array(pages, addrinarray,
+			__pgprot(_PAGE_CACHE_UC_MINUS));
+	if (!ret && new_type == _PAGE_CACHE_WC)
+		ret = change_page_attr_set_clr(NULL, addrinarray,
+					       __pgprot(_PAGE_CACHE_WC),
+					       __pgprot(_PAGE_CACHE_MASK),
+					       0, CPA_PAGES_ARRAY, pages);
+	if (ret)
+		goto err_out;
+	return 0; /* Success */
 err_out:
 	free_idx = i;
 	for (i = 0; i < free_idx; i++) {
@@ -1184,8 +1210,19 @@ err_out:
 	}
 	return -EINVAL;
 }
+
+int set_pages_array_uc(struct page **pages, int addrinarray)
+{
+	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_UC_MINUS);
+}
 EXPORT_SYMBOL(set_pages_array_uc);
 
+int set_pages_array_wc(struct page **pages, int addrinarray)
+{
+	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_WC);
+}
+EXPORT_SYMBOL(set_pages_array_wc);
+
 int set_pages_wb(struct page *page, int numpages)
 {
 	unsigned long addr = (unsigned long)page_address(page);
-- 
cgit v1.2.3-55-g7522


From d61931d89be506372d01a90d1755f6d0a9fafe2d Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Fri, 5 Mar 2010 17:34:46 +0100
Subject: x86: Add optimized popcnt variants

Add support for the hardware version of the Hamming weight function,
popcnt, present in CPUs which advertize it under CPUID, Function
0x0000_0001_ECX[23]. On CPUs which don't support it, we fallback to the
default lib/hweight.c sw versions.

A synthetic benchmark comparing popcnt with __sw_hweight64 showed almost
a 3x speedup on a F10h machine.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20100318112015.GC11152@aftab>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig                          |  5 +++
 arch/x86/include/asm/alternative.h        |  9 +++--
 arch/x86/include/asm/arch_hweight.h       | 59 +++++++++++++++++++++++++++++++
 arch/x86/include/asm/bitops.h             |  4 ++-
 include/asm-generic/bitops/arch_hweight.h | 22 +++++++++---
 lib/Makefile                              |  3 ++
 lib/hweight.c                             | 20 +++++------
 scripts/Makefile.lib                      |  4 +++
 8 files changed, 108 insertions(+), 18 deletions(-)
 create mode 100644 arch/x86/include/asm/arch_hweight.h

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0eacb1ffb421..89d8c54cdd37 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -238,6 +238,11 @@ config X86_32_LAZY_GS
 	def_bool y
 	depends on X86_32 && !CC_STACKPROTECTOR
 
+config ARCH_HWEIGHT_CFLAGS
+	string
+	default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
+	default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
+
 config KTIME_SCALAR
 	def_bool X86_32
 source "init/Kconfig"
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index b09ec55650b3..67dae51e7fd0 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -39,9 +39,6 @@
 #define LOCK_PREFIX ""
 #endif
 
-/* This must be included *after* the definition of LOCK_PREFIX */
-#include <asm/cpufeature.h>
-
 struct alt_instr {
 	u8 *instr;		/* original instruction */
 	u8 *replacement;
@@ -95,6 +92,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
       "663:\n\t" newinstr "\n664:\n"		/* replacement     */	\
       ".previous"
 
+/*
+ * This must be included *after* the definition of ALTERNATIVE due to
+ * <asm/arch_hweight.h>
+ */
+#include <asm/cpufeature.h>
+
 /*
  * Alternative instructions for different CPU types or capabilities.
  *
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
new file mode 100644
index 000000000000..d1fc3c219ae6
--- /dev/null
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -0,0 +1,59 @@
+#ifndef _ASM_X86_HWEIGHT_H
+#define _ASM_X86_HWEIGHT_H
+
+#ifdef CONFIG_64BIT
+/* popcnt %rdi, %rax */
+#define POPCNT ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
+#define REG_IN "D"
+#define REG_OUT "a"
+#else
+/* popcnt %eax, %eax */
+#define POPCNT ".byte 0xf3,0x0f,0xb8,0xc0"
+#define REG_IN "a"
+#define REG_OUT "a"
+#endif
+
+/*
+ * __sw_hweightXX are called from within the alternatives below
+ * and callee-clobbered registers need to be taken care of. See
+ * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
+ * compiler switches.
+ */
+static inline unsigned int __arch_hweight32(unsigned int w)
+{
+	unsigned int res = 0;
+
+	asm (ALTERNATIVE("call __sw_hweight32", POPCNT, X86_FEATURE_POPCNT)
+		     : "="REG_OUT (res)
+		     : REG_IN (w));
+
+	return res;
+}
+
+static inline unsigned int __arch_hweight16(unsigned int w)
+{
+	return __arch_hweight32(w & 0xffff);
+}
+
+static inline unsigned int __arch_hweight8(unsigned int w)
+{
+	return __arch_hweight32(w & 0xff);
+}
+
+static inline unsigned long __arch_hweight64(__u64 w)
+{
+	unsigned long res = 0;
+
+#ifdef CONFIG_X86_32
+	return  __arch_hweight32((u32)w) +
+		__arch_hweight32((u32)(w >> 32));
+#else
+	asm (ALTERNATIVE("call __sw_hweight64", POPCNT, X86_FEATURE_POPCNT)
+		     : "="REG_OUT (res)
+		     : REG_IN (w));
+#endif /* CONFIG_X86_32 */
+
+	return res;
+}
+
+#endif
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 02b47a603fc8..545776efeb16 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -444,7 +444,9 @@ static inline int fls(int x)
 
 #define ARCH_HAS_FAST_MULTIPLIER 1
 
-#include <asm-generic/bitops/hweight.h>
+#include <asm/arch_hweight.h>
+
+#include <asm-generic/bitops/const_hweight.h>
 
 #endif /* __KERNEL__ */
 
diff --git a/include/asm-generic/bitops/arch_hweight.h b/include/asm-generic/bitops/arch_hweight.h
index 3a7be842cdce..9a81c1e9436c 100644
--- a/include/asm-generic/bitops/arch_hweight.h
+++ b/include/asm-generic/bitops/arch_hweight.h
@@ -3,9 +3,23 @@
 
 #include <asm/types.h>
 
-extern unsigned int __arch_hweight32(unsigned int w);
-extern unsigned int __arch_hweight16(unsigned int w);
-extern unsigned int __arch_hweight8(unsigned int w);
-extern unsigned long __arch_hweight64(__u64 w);
+inline unsigned int __arch_hweight32(unsigned int w)
+{
+	return __sw_hweight32(w);
+}
 
+inline unsigned int __arch_hweight16(unsigned int w)
+{
+	return __sw_hweight16(w);
+}
+
+inline unsigned int __arch_hweight8(unsigned int w)
+{
+	return __sw_hweight8(w);
+}
+
+inline unsigned long __arch_hweight64(__u64 w)
+{
+	return __sw_hweight64(w);
+}
 #endif /* _ASM_GENERIC_BITOPS_HWEIGHT_H_ */
diff --git a/lib/Makefile b/lib/Makefile
index 2e152aed7198..abe63a8ad143 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -39,7 +39,10 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o
+
+CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
+
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
 obj-$(CONFIG_BTREE) += btree.o
 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
diff --git a/lib/hweight.c b/lib/hweight.c
index a6927e76840f..3c79d50814cf 100644
--- a/lib/hweight.c
+++ b/lib/hweight.c
@@ -9,7 +9,7 @@
  * The Hamming Weight of a number is the total number of bits set in it.
  */
 
-unsigned int __arch_hweight32(unsigned int w)
+unsigned int __sw_hweight32(unsigned int w)
 {
 #ifdef ARCH_HAS_FAST_MULTIPLIER
 	w -= (w >> 1) & 0x55555555;
@@ -24,30 +24,30 @@ unsigned int __arch_hweight32(unsigned int w)
 	return (res + (res >> 16)) & 0x000000FF;
 #endif
 }
-EXPORT_SYMBOL(__arch_hweight32);
+EXPORT_SYMBOL(__sw_hweight32);
 
-unsigned int __arch_hweight16(unsigned int w)
+unsigned int __sw_hweight16(unsigned int w)
 {
 	unsigned int res = w - ((w >> 1) & 0x5555);
 	res = (res & 0x3333) + ((res >> 2) & 0x3333);
 	res = (res + (res >> 4)) & 0x0F0F;
 	return (res + (res >> 8)) & 0x00FF;
 }
-EXPORT_SYMBOL(__arch_hweight16);
+EXPORT_SYMBOL(__sw_hweight16);
 
-unsigned int __arch_hweight8(unsigned int w)
+unsigned int __sw_hweight8(unsigned int w)
 {
 	unsigned int res = w - ((w >> 1) & 0x55);
 	res = (res & 0x33) + ((res >> 2) & 0x33);
 	return (res + (res >> 4)) & 0x0F;
 }
-EXPORT_SYMBOL(__arch_hweight8);
+EXPORT_SYMBOL(__sw_hweight8);
 
-unsigned long __arch_hweight64(__u64 w)
+unsigned long __sw_hweight64(__u64 w)
 {
 #if BITS_PER_LONG == 32
-	return __arch_hweight32((unsigned int)(w >> 32)) +
-	       __arch_hweight32((unsigned int)w);
+	return __sw_hweight32((unsigned int)(w >> 32)) +
+	       __sw_hweight32((unsigned int)w);
 #elif BITS_PER_LONG == 64
 #ifdef ARCH_HAS_FAST_MULTIPLIER
 	w -= (w >> 1) & 0x5555555555555555ul;
@@ -64,4 +64,4 @@ unsigned long __arch_hweight64(__u64 w)
 #endif
 #endif
 }
-EXPORT_SYMBOL(__arch_hweight64);
+EXPORT_SYMBOL(__sw_hweight64);
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index f9bdf264473d..cbcd654215e6 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -245,3 +245,7 @@ quiet_cmd_lzo = LZO    $@
 cmd_lzo = (cat $(filter-out FORCE,$^) | \
 	lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
 	(rm -f $@ ; false)
+
+# misc stuff
+# ---------------------------------------------------------------------------
+quote:="
-- 
cgit v1.2.3-55-g7522


From 5958f1d5d722df7a9e5d129676614a8e5219bacd Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Wed, 31 Mar 2010 21:56:41 +0200
Subject: x86, cpu: Add AMD core boosting feature flag to /proc/cpuinfo

By semi-popular demand, this adds the Core Performance Boost feature
flag to /proc/cpuinfo. Possible use case for this is userspace tools
like cpufreq-aperf, for example, so that they don't have to jump through
hoops of accessing "/dev/cpu/%d/cpuid" in order to check for CPB hw
support, or call cpuid from userspace.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-2-git-send-email-bp@amd64.org>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/cpufeature.h          | 1 +
 arch/x86/kernel/cpu/addon_cpuid_features.c | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 0cd82d068613..630e623f61e0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -161,6 +161,7 @@
  */
 #define X86_FEATURE_IDA		(7*32+ 0) /* Intel Dynamic Acceleration */
 #define X86_FEATURE_ARAT	(7*32+ 1) /* Always Running APIC Timer */
+#define X86_FEATURE_CPB		(7*32+ 2) /* AMD Core Performance Boost */
 
 /* Virtualization flags: Linux defined */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 97ad79cdf688..ead2a1cfa570 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -30,8 +30,9 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	const struct cpuid_bit *cb;
 
 	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
-		{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
-		{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
+		{ X86_FEATURE_IDA,   CR_EAX, 1, 0x00000006 },
+		{ X86_FEATURE_ARAT,  CR_EAX, 2, 0x00000006 },
+		{ X86_FEATURE_CPB,   CR_EDX, 9, 0x80000007 },
 		{ X86_FEATURE_NPT,   CR_EDX, 0, 0x8000000a },
 		{ X86_FEATURE_LBRV,  CR_EDX, 1, 0x8000000a },
 		{ X86_FEATURE_SVML,  CR_EDX, 2, 0x8000000a },
-- 
cgit v1.2.3-55-g7522


From 73860c6b2fd159a35637e233d735e36887c266ad Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Wed, 31 Mar 2010 21:56:42 +0200
Subject: powernow-k8: Add core performance boost support

Starting with F10h, revE, AMD processors add support for a dynamic
core boosting feature called Core Performance Boost. When a specific
condition is present, a subset of the cores on a system are boosted
beyond their P0 operating frequency to speed up the performance of
single-threaded applications.

In the normal case, the system comes out of reset with core boosting
enabled. This patch adds a sysfs knob with which core boosting can be
switched on or off for benchmarking purposes.

While at it, make the CPB code hotplug-aware so that taking cores
offline wouldn't interfere with boosting the remaining online cores.
Furthermore, add cpu_online_mask hotplug protection as suggested by
Andrew.

Finally, cleanup the driver init codepath and update copyrights.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-3-git-send-email-bp@amd64.org>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 161 ++++++++++++++++++++++++++++--
 arch/x86/kernel/cpu/cpufreq/powernow-k8.h |   2 -
 2 files changed, 151 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index d360b56e9825..74ca34b5c003 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,6 +1,5 @@
-
 /*
- *   (c) 2003-2006 Advanced Micro Devices, Inc.
+ *   (c) 2003-2010 Advanced Micro Devices, Inc.
  *  Your use of this code is subject to the terms and conditions of the
  *  GNU general public license version 2. See "COPYING" or
  *  http://www.gnu.org/licenses/gpl.html
@@ -54,6 +53,10 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
 
 static int cpu_family = CPU_OPTERON;
 
+/* core performance boost */
+static bool cpb_capable, cpb_enabled;
+static struct msr __percpu *msrs;
+
 #ifndef CONFIG_SMP
 static inline const struct cpumask *cpu_core_mask(int cpu)
 {
@@ -1393,8 +1396,77 @@ out:
 	return khz;
 }
 
+static void _cpb_toggle_msrs(bool t)
+{
+	int cpu;
+
+	get_online_cpus();
+
+	rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+	for_each_cpu(cpu, cpu_online_mask) {
+		struct msr *reg = per_cpu_ptr(msrs, cpu);
+		if (t)
+			reg->l &= ~BIT(25);
+		else
+			reg->l |= BIT(25);
+	}
+	wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+	put_online_cpus();
+}
+
+/*
+ * Switch on/off core performance boosting.
+ *
+ * 0=disable
+ * 1=enable.
+ */
+static void cpb_toggle(bool t)
+{
+	if (!cpb_capable)
+		return;
+
+	if (t && !cpb_enabled) {
+		cpb_enabled = true;
+		_cpb_toggle_msrs(t);
+		printk(KERN_INFO PFX "Core Boosting enabled.\n");
+	} else if (!t && cpb_enabled) {
+		cpb_enabled = false;
+		_cpb_toggle_msrs(t);
+		printk(KERN_INFO PFX "Core Boosting disabled.\n");
+	}
+}
+
+static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
+				 size_t count)
+{
+	int ret = -EINVAL;
+	unsigned long val = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (!ret && (val == 0 || val == 1) && cpb_capable)
+		cpb_toggle(val);
+	else
+		return -EINVAL;
+
+	return count;
+}
+
+static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
+{
+	return sprintf(buf, "%u\n", cpb_enabled);
+}
+
+#define define_one_rw(_name) \
+static struct freq_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+define_one_rw(cpb);
+
 static struct freq_attr *powernow_k8_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
+	&cpb,
 	NULL,
 };
 
@@ -1410,10 +1482,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
 	.attr		= powernow_k8_attr,
 };
 
+/*
+ * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
+ * cannot block the remaining ones from boosting. On the CPU_UP path we
+ * simply keep the boost-disable flag in sync with the current global
+ * state.
+ */
+static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
+				void *hcpu)
+{
+	unsigned cpu = (long)hcpu;
+	u32 lo, hi;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+
+		if (!cpb_enabled) {
+			rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+			lo |= BIT(25);
+			wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+		}
+		break;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+		lo &= ~BIT(25);
+		wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+		break;
+
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata cpb_nb = {
+	.notifier_call		= cpb_notify,
+};
+
 /* driver entry point for init */
 static int __cpuinit powernowk8_init(void)
 {
-	unsigned int i, supported_cpus = 0;
+	unsigned int i, supported_cpus = 0, cpu;
 
 	for_each_online_cpu(i) {
 		int rc;
@@ -1422,15 +1535,36 @@ static int __cpuinit powernowk8_init(void)
 			supported_cpus++;
 	}
 
-	if (supported_cpus == num_online_cpus()) {
-		printk(KERN_INFO PFX "Found %d %s "
-			"processors (%d cpu cores) (" VERSION ")\n",
-			num_online_nodes(),
-			boot_cpu_data.x86_model_id, supported_cpus);
-		return cpufreq_register_driver(&cpufreq_amd64_driver);
+	if (supported_cpus != num_online_cpus())
+		return -ENODEV;
+
+	printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
+		num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
+
+	if (boot_cpu_has(X86_FEATURE_CPB)) {
+
+		cpb_capable = true;
+
+		register_cpu_notifier(&cpb_nb);
+
+		msrs = msrs_alloc();
+		if (!msrs) {
+			printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
+			return -ENOMEM;
+		}
+
+		rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+		for_each_cpu(cpu, cpu_online_mask) {
+			struct msr *reg = per_cpu_ptr(msrs, cpu);
+			cpb_enabled |= !(!!(reg->l & BIT(25)));
+		}
+
+		printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
+			(cpb_enabled ? "on" : "off"));
 	}
 
-	return -ENODEV;
+	return cpufreq_register_driver(&cpufreq_amd64_driver);
 }
 
 /* driver entry point for term */
@@ -1438,6 +1572,13 @@ static void __exit powernowk8_exit(void)
 {
 	dprintk("exit\n");
 
+	if (boot_cpu_has(X86_FEATURE_CPB)) {
+		msrs_free(msrs);
+		msrs = NULL;
+
+		unregister_cpu_notifier(&cpb_nb);
+	}
+
 	cpufreq_unregister_driver(&cpufreq_amd64_driver);
 }
 
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 02ce824073cb..df3529b1c02d 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,7 +5,6 @@
  *  http://www.gnu.org/licenses/gpl.html
  */
 
-
 enum pstate {
 	HW_PSTATE_INVALID = 0xff,
 	HW_PSTATE_0 = 0,
@@ -55,7 +54,6 @@ struct powernow_k8_data {
 	struct cpumask *available_cores;
 };
 
-
 /* processor's cpuid instruction support */
 #define CPUID_PROCESSOR_SIGNATURE	1	/* function 1 */
 #define CPUID_XFAM			0x0ff00000	/* extended family */
-- 
cgit v1.2.3-55-g7522


From d65ad45cd82a0db9544469b8c54f5dc5cafbb2d8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Wed, 31 Mar 2010 21:56:43 +0200
Subject: x86: Unify APERF/MPERF support

Initialize this CPUID flag feature in common code. It could be made a
standalone function later, maybe, if more functionality is duplicated.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-4-git-send-email-bp@amd64.org>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c | 8 ++++++++
 arch/x86/kernel/cpu/intel.c                | 6 ------
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index ead2a1cfa570..fd1fc1902a47 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -54,6 +54,14 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		if (regs[cb->reg] & (1 << cb->bit))
 			set_cpu_cap(c, cb->feature);
 	}
+
+	/*
+	 * common AMD/Intel features
+	 */
+	if (c->cpuid_level >= 6) {
+		if (cpuid_ecx(6) & 0x1)
+			set_cpu_cap(c, X86_FEATURE_APERFMPERF);
+	}
 }
 
 /* leaf 0xb SMT level */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7e1cca13af35..3830258a5f53 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -352,12 +352,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
 	}
 
-	if (c->cpuid_level > 6) {
-		unsigned ecx = cpuid_ecx(6);
-		if (ecx & 0x01)
-			set_cpu_cap(c, X86_FEATURE_APERFMPERF);
-	}
-
 	if (cpu_has_xmm2)
 		set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
 	if (cpu_has_ds) {
-- 
cgit v1.2.3-55-g7522


From a2fed573f065e526bfd5cbf26e5491973d9e9aaa Mon Sep 17 00:00:00 2001
From: Mark Langsdorf
Date: Thu, 18 Mar 2010 18:41:46 +0100
Subject: x86, cpufreq: Add APERF/MPERF support for AMD processors

Starting with model 10 of Family 0x10, AMD processors may have
support for APERF/MPERF.  Add support for identifying it and using
it within cpufreq.  Move the APERF/MPERF functions out of the
acpi-cpufreq code and into their own file so they can easily be
shared.

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <20100401141956.GA1930@aftab>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/cpufreq/Makefile       |  4 +--
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 44 ++------------------------
 arch/x86/kernel/cpu/cpufreq/mperf.c        | 51 ++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/cpufreq/mperf.h        |  9 ++++++
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c  |  8 +++++
 5 files changed, 72 insertions(+), 44 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/cpufreq/mperf.c
 create mode 100644 arch/x86/kernel/cpu/cpufreq/mperf.h

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 1840c0a5170b..bd54bf67e6fb 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -2,8 +2,8 @@
 # K8 systems. ACPI is preferred to all other hardware-specific drivers.
 # speedstep-* is preferred over p4-clockmod.
 
-obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o
+obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o mperf.o
+obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o mperf.o
 obj-$(CONFIG_X86_PCC_CPUFREQ)		+= pcc-cpufreq.o
 obj-$(CONFIG_X86_POWERNOW_K6)		+= powernow-k6.o
 obj-$(CONFIG_X86_POWERNOW_K7)		+= powernow-k7.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1b1920fa7c80..dc68e5c2c071 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -45,6 +45,7 @@
 #include <asm/msr.h>
 #include <asm/processor.h>
 #include <asm/cpufeature.h>
+#include "mperf.h"
 
 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
 		"acpi-cpufreq", msg)
@@ -70,8 +71,6 @@ struct acpi_cpufreq_data {
 
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
 
-static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
-
 /* acpi_perf_data is a pointer to percpu data. */
 static struct acpi_processor_performance *acpi_perf_data;
 
@@ -239,45 +238,6 @@ static u32 get_cur_val(const struct cpumask *mask)
 	return cmd.val;
 }
 
-/* Called via smp_call_function_single(), on the target CPU */
-static void read_measured_perf_ctrs(void *_cur)
-{
-	struct aperfmperf *am = _cur;
-
-	get_aperfmperf(am);
-}
-
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
- * no meaning should be associated with absolute values of these MSRs.
- */
-static unsigned int get_measured_perf(struct cpufreq_policy *policy,
-				      unsigned int cpu)
-{
-	struct aperfmperf perf;
-	unsigned long ratio;
-	unsigned int retval;
-
-	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
-		return 0;
-
-	ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
-	per_cpu(acfreq_old_perf, cpu) = perf;
-
-	retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
-
-	return retval;
-}
-
 static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 {
 	struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
@@ -701,7 +661,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	/* Check for APERF/MPERF support in hardware */
 	if (cpu_has(c, X86_FEATURE_APERFMPERF))
-		acpi_cpufreq_driver.getavg = get_measured_perf;
+		acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
 
 	dprintk("CPU%u - ACPI performance management activated.\n", cpu);
 	for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
new file mode 100644
index 000000000000..911e193018ae
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.c
@@ -0,0 +1,51 @@
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/slab.h>
+
+#include "mperf.h"
+
+static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
+
+/* Called via smp_call_function_single(), on the target CPU */
+static void read_measured_perf_ctrs(void *_cur)
+{
+	struct aperfmperf *am = _cur;
+
+	get_aperfmperf(am);
+}
+
+/*
+ * Return the measured active (C0) frequency on this CPU since last call
+ * to this function.
+ * Input: cpu number
+ * Return: Average CPU frequency in terms of max frequency (zero on error)
+ *
+ * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
+ * over a period of time, while CPU is in C0 state.
+ * IA32_MPERF counts at the rate of max advertised frequency
+ * IA32_APERF counts at the rate of actual CPU frequency
+ * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
+ * no meaning should be associated with absolute values of these MSRs.
+ */
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+					unsigned int cpu)
+{
+	struct aperfmperf perf;
+	unsigned long ratio;
+	unsigned int retval;
+
+	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
+		return 0;
+
+	ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
+	per_cpu(acfreq_old_perf, cpu) = perf;
+
+	retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
new file mode 100644
index 000000000000..5dbf2950dc22
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.h
@@ -0,0 +1,9 @@
+/*
+ *  (c) 2010 Advanced Micro Devices, Inc.
+ *  Your use of this code is subject to the terms and conditions of the
+ *  GNU general public license version 2. See "COPYING" or
+ *  http://www.gnu.org/licenses/gpl.html
+ */
+
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+					unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 74ca34b5c003..52fce638f444 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -45,6 +45,7 @@
 #define PFX "powernow-k8: "
 #define VERSION "version 2.20.00"
 #include "powernow-k8.h"
+#include "mperf.h"
 
 /* serialize freq changes  */
 static DEFINE_MUTEX(fidvid_mutex);
@@ -57,6 +58,8 @@ static int cpu_family = CPU_OPTERON;
 static bool cpb_capable, cpb_enabled;
 static struct msr __percpu *msrs;
 
+static struct cpufreq_driver cpufreq_amd64_driver;
+
 #ifndef CONFIG_SMP
 static inline const struct cpumask *cpu_core_mask(int cpu)
 {
@@ -1251,6 +1254,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 	struct powernow_k8_data *data;
 	struct init_on_cpu init_on_cpu;
 	int rc;
+	struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
 
 	if (!cpu_online(pol->cpu))
 		return -ENODEV;
@@ -1325,6 +1329,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 		return -EINVAL;
 	}
 
+	/* Check for APERF/MPERF support in hardware */
+	if (cpu_has(c, X86_FEATURE_APERFMPERF))
+		cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
+
 	cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
 
 	if (cpu_family == CPU_HW_PSTATE)
-- 
cgit v1.2.3-55-g7522


From 679370641e3675633cad222449262abbe93a4a2a Mon Sep 17 00:00:00 2001
From: Mark Langsdorf
Date: Wed, 31 Mar 2010 21:56:45 +0200
Subject: powernow-k8: Fix frequency reporting

With F10, model 10, all valid frequencies are in the ACPI _PST table.

Cc: <stable@kernel.org> # 33.x 32.x
Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <1270065406-1814-6-git-send-email-bp@amd64.org>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 52fce638f444..6f3dc8fbbfdc 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -935,7 +935,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
 		powernow_table[i].index = index;
 
 		/* Frequency may be rounded for these */
-		if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) {
+		if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
+				 || boot_cpu_data.x86 == 0x11) {
 			powernow_table[i].frequency =
 				freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
 		} else
-- 
cgit v1.2.3-55-g7522


From 3f10940e4fb69d312602078f2c5234206797ca31 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Sat, 10 Apr 2010 16:46:21 +0200
Subject: x86/microcode: Use nonseekable_open()

No need to seek on this file, so prevent it outright so we can
avoid using default_llseek - removes one more BKL usage.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
[drop useless llseek = no_llseek and smp_lock.h inclusion]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnd Bergmann <arnd@relay.de.ibm.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Dmitry Adamushko <dmitry.adamushko@gmail.com>
LKML-Reference: <1270910781-8786-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index cceb5bc3c3c2..2cd8c544e41a 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -201,9 +201,9 @@ static int do_microcode_update(const void __user *buf, size_t size)
 	return error;
 }
 
-static int microcode_open(struct inode *unused1, struct file *unused2)
+static int microcode_open(struct inode *inode, struct file *file)
 {
-	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+	return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
 }
 
 static ssize_t microcode_write(struct file *file, const char __user *buf,
-- 
cgit v1.2.3-55-g7522


From b8f7fb13d2d7ff14818fd1d3edd8b834d38b0217 Mon Sep 17 00:00:00 2001
From: Cliff Wickman
Date: Wed, 14 Apr 2010 11:35:46 -0500
Subject: x86, UV: Improve BAU performance and error recovery

- increase performance of the interrupt handler

- release timed-out software acknowledge resources

- recover from continuous-busy status due to a hardware issue

- add a 'throttle' to keep a uvhub from sending more than a
  specified number of broadcasts concurrently (work around the hardware issue)

- provide a 'nobau' boot command line option

- rename 'pnode' and 'node' to 'uvhub' (the 'node' terminology
  is ambiguous)

- add some new statistics about the scope of broadcasts, retries, the
  hardware issue and the 'throttle'

- split off new function uv_bau_retry_msg() from
  uv_bau_process_message() per community coding style feedback.

- simplify the argument list to uv_bau_process_message(), per
  community coding style feedback.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: linux-mm@kvack.org
Cc: Jack Steiner <steiner@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Mike Travis <travis@sgi.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <E1O25Z4-0004Ur-PB@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h |  247 +++++---
 arch/x86/kernel/tlb_uv.c         | 1270 +++++++++++++++++++++++++++-----------
 2 files changed, 1075 insertions(+), 442 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index b414d2b401f6..aa558ac0306e 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -27,13 +27,14 @@
  * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
  *
  * We will use 31 sets, one for sending BAU messages from each of the 32
- * cpu's on the node.
+ * cpu's on the uvhub.
  *
  * TLB shootdown will use the first of the 8 descriptors of each set.
  * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
  */
 
 #define UV_ITEMS_PER_DESCRIPTOR		8
+#define MAX_BAU_CONCURRENT		3
 #define UV_CPUS_PER_ACT_STATUS		32
 #define UV_ACT_STATUS_MASK		0x3
 #define UV_ACT_STATUS_SIZE		2
@@ -45,6 +46,9 @@
 #define UV_PAYLOADQ_PNODE_SHIFT		49
 #define UV_PTC_BASENAME			"sgi_uv/ptc_statistics"
 #define uv_physnodeaddr(x)		((__pa((unsigned long)(x)) & uv_mmask))
+#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
 
 /*
  * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
@@ -55,15 +59,29 @@
 #define DESC_STATUS_SOURCE_TIMEOUT	3
 
 /*
- * source side thresholds at which message retries print a warning
+ * source side threshholds at which message retries print a warning
  */
 #define SOURCE_TIMEOUT_LIMIT		20
 #define DESTINATION_TIMEOUT_LIMIT	20
 
+/*
+ * misc. delays, in microseconds
+ */
+#define THROTTLE_DELAY			10
+#define TIMEOUT_DELAY			10
+#define BIOS_TO				1000
+/* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */
+
+/*
+ * threshholds at which to use IPI to free resources
+ */
+#define PLUGSB4RESET 100
+#define TIMEOUTSB4RESET 100
+
 /*
  * number of entries in the destination side payload queue
  */
-#define DEST_Q_SIZE			17
+#define DEST_Q_SIZE			20
 /*
  * number of destination side software ack resources
  */
@@ -72,9 +90,10 @@
 /*
  * completion statuses for sending a TLB flush message
  */
-#define	FLUSH_RETRY			1
-#define	FLUSH_GIVEUP			2
-#define	FLUSH_COMPLETE			3
+#define FLUSH_RETRY_PLUGGED		1
+#define FLUSH_RETRY_TIMEOUT		2
+#define FLUSH_GIVEUP			3
+#define FLUSH_COMPLETE			4
 
 /*
  * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
@@ -86,14 +105,14 @@
  * 'base_dest_nodeid' field of the header corresponds to the
  * destination nodeID associated with that specified bit.
  */
-struct bau_target_nodemask {
-	unsigned long bits[BITS_TO_LONGS(256)];
+struct bau_target_uvhubmask {
+	unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
 };
 
 /*
- * mask of cpu's on a node
+ * mask of cpu's on a uvhub
  * (during initialization we need to check that unsigned long has
- *  enough bits for max. cpu's per node)
+ *  enough bits for max. cpu's per uvhub)
  */
 struct bau_local_cpumask {
 	unsigned long bits;
@@ -135,8 +154,8 @@ struct bau_msg_payload {
 struct bau_msg_header {
 	unsigned int dest_subnodeid:6;	/* must be 0x10, for the LB */
 	/* bits 5:0 */
-	unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */
-	/* bits 20:6 */			  /* first bit in node_map */
+	unsigned int base_dest_nodeid:15; /* nasid (pnode<<1) of */
+	/* bits 20:6 */			  /* first bit in uvhub map */
 	unsigned int command:8;	/* message type */
 	/* bits 28:21 */
 				/* 0x38: SN3net EndPoint Message */
@@ -146,26 +165,38 @@ struct bau_msg_header {
 	unsigned int rsvd_2:9;	/* must be zero */
 	/* bits 40:32 */
 				/* Suppl_A is 56-41 */
-	unsigned int payload_2a:8;/* becomes byte 16 of msg */
-	/* bits 48:41 */	/* not currently using */
-	unsigned int payload_2b:8;/* becomes byte 17 of msg */
-	/* bits 56:49 */	/* not currently using */
+	unsigned int sequence:16;/* message sequence number */
+	/* bits 56:41 */	/* becomes bytes 16-17 of msg */
 				/* Address field (96:57) is never used as an
 				   address (these are address bits 42:3) */
+
 	unsigned int rsvd_3:1;	/* must be zero */
 	/* bit 57 */
 				/* address bits 27:4 are payload */
-				/* these 24 bits become bytes 12-14 of msg */
+	/* these next 24  (58-81) bits become bytes 12-14 of msg */
+
+	/* bits 65:58 land in byte 12 */
 	unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */
 	/* bit 58 */
-
-	unsigned int payload_1a:5;/* not currently used */
-	/* bits 63:59 */
-	unsigned int payload_1b:8;/* not currently used */
-	/* bits 71:64 */
-	unsigned int payload_1c:8;/* not currently used */
-	/* bits 79:72 */
-	unsigned int payload_1d:2;/* not currently used */
+	unsigned int msg_type:3; /* software type of the message*/
+	/* bits 61:59 */
+	unsigned int canceled:1; /* message canceled, resource to be freed*/
+	/* bit 62 */
+	unsigned int payload_1a:1;/* not currently used */
+	/* bit 63 */
+	unsigned int payload_1b:2;/* not currently used */
+	/* bits 65:64 */
+
+	/* bits 73:66 land in byte 13 */
+	unsigned int payload_1ca:6;/* not currently used */
+	/* bits 71:66 */
+	unsigned int payload_1c:2;/* not currently used */
+	/* bits 73:72 */
+
+	/* bits 81:74 land in byte 14 */
+	unsigned int payload_1d:6;/* not currently used */
+	/* bits 79:74 */
+	unsigned int payload_1e:2;/* not currently used */
 	/* bits 81:80 */
 
 	unsigned int rsvd_4:7;	/* must be zero */
@@ -178,7 +209,7 @@ struct bau_msg_header {
 	/* bits 95:90 */
 	unsigned int rsvd_6:5;	/* must be zero */
 	/* bits 100:96 */
-	unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */
+	unsigned int int_both:1;/* if 1, interrupt both sockets on the uvhub */
 	/* bit 101*/
 	unsigned int fairness:3;/* usually zero */
 	/* bits 104:102 */
@@ -191,13 +222,18 @@ struct bau_msg_header {
 	/* bits 127:107 */
 };
 
+/* see msg_type: */
+#define MSG_NOOP 0
+#define MSG_REGULAR 1
+#define MSG_RETRY 2
+
 /*
  * The activation descriptor:
  * The format of the message to send, plus all accompanying control
  * Should be 64 bytes
  */
 struct bau_desc {
-	struct bau_target_nodemask distribution;
+	struct bau_target_uvhubmask distribution;
 	/*
 	 * message template, consisting of header and payload:
 	 */
@@ -237,19 +273,25 @@ struct bau_payload_queue_entry {
 	unsigned short acknowledge_count; /* filled in by destination */
 	/* 16 bits, bytes 10-11 */
 
-	unsigned short replied_to:1;	/* sent as 0 by the source */
-	/* 1 bit */
-	unsigned short unused1:7;       /* not currently using */
-	/* 7 bits: byte 12) */
+	/* these next 3 bytes come from bits 58-81 of the message header */
+	unsigned short replied_to:1;    /* sent as 0 by the source */
+	unsigned short msg_type:3;      /* software message type */
+	unsigned short canceled:1;      /* sent as 0 by the source */
+	unsigned short unused1:3;       /* not currently using */
+	/* byte 12 */
 
-	unsigned char unused2[2];	/* not currently using */
-	/* bytes 13-14 */
+	unsigned char unused2a;		/* not currently using */
+	/* byte 13 */
+	unsigned char unused2;		/* not currently using */
+	/* byte 14 */
 
 	unsigned char sw_ack_vector;	/* filled in by the hardware */
 	/* byte 15 (bits 127:120) */
 
-	unsigned char unused4[3];	/* not currently using bytes 17-19 */
-	/* bytes 17-19 */
+	unsigned short sequence;	/* message sequence number */
+	/* bytes 16-17 */
+	unsigned char unused4[2];	/* not currently using bytes 18-19 */
+	/* bytes 18-19 */
 
 	int number_of_cpus;		/* filled in at destination */
 	/* 32 bits, bytes 20-23 (aligned) */
@@ -259,63 +301,93 @@ struct bau_payload_queue_entry {
 };
 
 /*
- * one for every slot in the destination payload queue
- */
-struct bau_msg_status {
-	struct bau_local_cpumask seen_by;	/* map of cpu's */
-};
-
-/*
- * one for every slot in the destination software ack resources
- */
-struct bau_sw_ack_status {
-	struct bau_payload_queue_entry *msg;	/* associated message */
-	int watcher;				/* cpu monitoring, or -1 */
-};
-
-/*
- * one on every node and per-cpu; to locate the software tables
+ * one per-cpu; to locate the software tables
  */
 struct bau_control {
 	struct bau_desc *descriptor_base;
-	struct bau_payload_queue_entry *bau_msg_head;
 	struct bau_payload_queue_entry *va_queue_first;
 	struct bau_payload_queue_entry *va_queue_last;
-	struct bau_msg_status *msg_statuses;
-	int *watching; /* pointer to array */
+	struct bau_payload_queue_entry *bau_msg_head;
+	struct bau_control *uvhub_master;
+	struct bau_control *socket_master;
+	unsigned long timeout_interval;
+	atomic_t active_descriptor_count;
+	int max_concurrent;
+	int max_concurrent_constant;
+	int retry_message_scans;
+	int plugged_tries;
+	int timeout_tries;
+	int ipi_attempts;
+	int conseccompletes;
+	short cpu;
+	short uvhub_cpu;
+	short uvhub;
+	short cpus_in_socket;
+	short cpus_in_uvhub;
+	unsigned short message_number;
+	unsigned short uvhub_quiesce;
+	short socket_acknowledge_count[DEST_Q_SIZE];
+	cycles_t send_message;
+	spinlock_t masks_lock;
+	spinlock_t uvhub_lock;
+	spinlock_t queue_lock;
 };
 
 /*
  * This structure is allocated per_cpu for UV TLB shootdown statistics.
  */
 struct ptc_stats {
-	unsigned long ptc_i;	/* number of IPI-style flushes */
-	unsigned long requestor;	/* number of nodes this cpu sent to */
-	unsigned long requestee;	/* times cpu was remotely requested */
-	unsigned long alltlb;	/* times all tlb's on this cpu were flushed */
-	unsigned long onetlb;	/* times just one tlb on this cpu was flushed */
-	unsigned long s_retry;	/* retries on source side timeouts */
-	unsigned long d_retry;	/* retries on destination side timeouts */
-	unsigned long sflush;	/* cycles spent in uv_flush_tlb_others */
-	unsigned long dflush;	/* cycles spent on destination side */
-	unsigned long retriesok; /* successes on retries */
-	unsigned long nomsg;	/* interrupts with no message */
-	unsigned long multmsg;	/* interrupts with multiple messages */
-	unsigned long ntargeted;/* nodes targeted */
+	/* sender statistics */
+	unsigned long s_giveup; /* number of fall backs to IPI-style flushes */
+	unsigned long s_requestor; /* number of shootdown requests */
+	unsigned long s_stimeout; /* source side timeouts */
+	unsigned long s_dtimeout; /* destination side timeouts */
+	unsigned long s_time; /* time spent in sending side */
+	unsigned long s_retriesok; /* successful retries */
+	unsigned long s_ntargcpu; /* number of cpus targeted */
+	unsigned long s_ntarguvhub; /* number of uvhubs targeted */
+	unsigned long s_ntarguvhub16; /* number of times >= 16 target hubs */
+	unsigned long s_ntarguvhub8; /* number of times >= 8 target hubs */
+	unsigned long s_ntarguvhub4; /* number of times >= 4 target hubs */
+	unsigned long s_ntarguvhub2; /* number of times >= 2 target hubs */
+	unsigned long s_ntarguvhub1; /* number of times == 1 target hub */
+	unsigned long s_resets_plug; /* ipi-style resets from plug state */
+	unsigned long s_resets_timeout; /* ipi-style resets from timeouts */
+	unsigned long s_busy; /* status stayed busy past s/w timer */
+	unsigned long s_throttles; /* waits in throttle */
+	unsigned long s_retry_messages; /* retry broadcasts */
+	/* destination statistics */
+	unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
+	unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
+	unsigned long d_multmsg; /* interrupts with multiple messages */
+	unsigned long d_nomsg; /* interrupts with no message */
+	unsigned long d_time; /* time spent on destination side */
+	unsigned long d_requestee; /* number of messages processed */
+	unsigned long d_retries; /* number of retry messages processed */
+	unsigned long d_canceled; /* number of messages canceled by retries */
+	unsigned long d_nocanceled; /* retries that found nothing to cancel */
+	unsigned long d_resets; /* number of ipi-style requests processed */
+	unsigned long d_rcanceled; /* number of messages canceled by resets */
 };
 
-static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp)
+static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
 {
-	return constant_test_bit(node, &dstp->bits[0]);
+	return constant_test_bit(uvhub, &dstp->bits[0]);
 }
-static inline void bau_node_set(int node, struct bau_target_nodemask *dstp)
+static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp)
 {
-	__set_bit(node, &dstp->bits[0]);
+	__set_bit(uvhub, &dstp->bits[0]);
 }
-static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits)
+static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp,
+				    int nbits)
 {
 	bitmap_zero(&dstp->bits[0], nbits);
 }
+static inline int bau_uvhub_weight(struct bau_target_uvhubmask *dstp)
+{
+	return bitmap_weight((unsigned long *)&dstp->bits[0],
+				UV_DISTRIBUTION_SIZE);
+}
 
 static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
 {
@@ -328,4 +400,35 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
 extern void uv_bau_message_intr1(void);
 extern void uv_bau_timeout_intr1(void);
 
+struct atomic_short {
+	short counter;
+};
+
+/**
+ * atomic_read_short - read a short atomic variable
+ * @v: pointer of type atomic_short
+ *
+ * Atomically reads the value of @v.
+ */
+static inline int atomic_read_short(const struct atomic_short *v)
+{
+	return v->counter;
+}
+
+/**
+ * atomic_add_short_return - add and return a short int
+ * @i: short value to add
+ * @v: pointer of type atomic_short
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static inline int atomic_add_short_return(short i, struct atomic_short *v)
+{
+	short __i = i;
+	asm volatile(LOCK_PREFIX "xaddw %0, %1"
+			: "+r" (i), "+m" (v->counter)
+			: : "memory");
+	return i + __i;
+}
+
 #endif /* _ASM_X86_UV_UV_BAU_H */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ef68ba48564b..414f7c4fe76c 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1,7 +1,7 @@
 /*
  *	SGI UltraViolet TLB flush routines.
  *
- *	(c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
+ *	(c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
  *
  *	This code is released under the GNU General Public License version 2 or
  *	later.
@@ -19,44 +19,67 @@
 #include <asm/idle.h>
 #include <asm/tsc.h>
 #include <asm/irq_vectors.h>
+#include <asm/timer.h>
+
+struct msg_desc {
+	struct bau_payload_queue_entry *msg;
+	int msg_slot;
+	int sw_ack_slot;
+	struct bau_payload_queue_entry *va_queue_first;
+	struct bau_payload_queue_entry *va_queue_last;
+};
 
 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD	0x000000000bUL
 
-static struct bau_control	**uv_bau_table_bases __read_mostly;
-static int			uv_bau_retry_limit __read_mostly;
+static int uv_bau_max_concurrent __read_mostly;
 
-/* base pnode in this partition */
-static int			uv_partition_base_pnode __read_mostly;
+static int nobau;
+static int __init setup_nobau(char *arg)
+{
+	nobau = 1;
+	return 0;
+}
+early_param("nobau", setup_nobau);
 
-static unsigned long		uv_mmask __read_mostly;
+/* base pnode in this partition */
+static int uv_partition_base_pnode __read_mostly;
+/* position of pnode (which is nasid>>1): */
+static int uv_nshift __read_mostly;
+static unsigned long uv_mmask __read_mostly;
 
 static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
 static DEFINE_PER_CPU(struct bau_control, bau_control);
+static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
+
+struct reset_args {
+	int sender;
+};
 
 /*
- * Determine the first node on a blade.
+ * Determine the first node on a uvhub. 'Nodes' are used for kernel
+ * memory allocation.
  */
-static int __init blade_to_first_node(int blade)
+static int __init uvhub_to_first_node(int uvhub)
 {
 	int node, b;
 
 	for_each_online_node(node) {
 		b = uv_node_to_blade_id(node);
-		if (blade == b)
+		if (uvhub == b)
 			return node;
 	}
-	return -1; /* shouldn't happen */
+	return -1;
 }
 
 /*
- * Determine the apicid of the first cpu on a blade.
+ * Determine the apicid of the first cpu on a uvhub.
  */
-static int __init blade_to_first_apicid(int blade)
+static int __init uvhub_to_first_apicid(int uvhub)
 {
 	int cpu;
 
 	for_each_present_cpu(cpu)
-		if (blade == uv_cpu_to_blade_id(cpu))
+		if (uvhub == uv_cpu_to_blade_id(cpu))
 			return per_cpu(x86_cpu_to_apicid, cpu);
 	return -1;
 }
@@ -69,195 +92,459 @@ static int __init blade_to_first_apicid(int blade)
  * clear of the Timeout bit (as well) will free the resource. No reply will
  * be sent (the hardware will only do one reply per message).
  */
-static void uv_reply_to_message(int resource,
-				struct bau_payload_queue_entry *msg,
-				struct bau_msg_status *msp)
+static inline void uv_reply_to_message(struct msg_desc *mdp,
+				       struct bau_control *bcp)
 {
 	unsigned long dw;
+	struct bau_payload_queue_entry *msg;
 
-	dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
+	msg = mdp->msg;
+	if (!msg->canceled) {
+		dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
+						msg->sw_ack_vector;
+		uv_write_local_mmr(
+				UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
+	}
 	msg->replied_to = 1;
 	msg->sw_ack_vector = 0;
-	if (msp)
-		msp->seen_by.bits = 0;
-	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
 }
 
 /*
- * Do all the things a cpu should do for a TLB shootdown message.
- * Other cpu's may come here at the same time for this message.
+ * Process the receipt of a RETRY message
  */
-static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
-				   int msg_slot, int sw_ack_slot)
+static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
+					    struct bau_control *bcp)
 {
-	unsigned long this_cpu_mask;
-	struct bau_msg_status *msp;
-	int cpu;
+	int i;
+	int cancel_count = 0;
+	int slot2;
+	unsigned long msg_res;
+	unsigned long mmr = 0;
+	struct bau_payload_queue_entry *msg;
+	struct bau_payload_queue_entry *msg2;
+	struct ptc_stats *stat;
 
-	msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
-	cpu = uv_blade_processor_id();
-	msg->number_of_cpus =
-		uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
-	this_cpu_mask = 1UL << cpu;
-	if (msp->seen_by.bits & this_cpu_mask)
-		return;
-	atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
+	msg = mdp->msg;
+	stat = &per_cpu(ptcstats, bcp->cpu);
+	stat->d_retries++;
+	/*
+	 * cancel any message from msg+1 to the retry itself
+	 */
+	for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
+		if (msg2 > mdp->va_queue_last)
+			msg2 = mdp->va_queue_first;
+		if (msg2 == msg)
+			break;
+
+		/* same conditions for cancellation as uv_do_reset */
+		if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
+		    (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
+			msg->sw_ack_vector) == 0) &&
+		    (msg2->sending_cpu == msg->sending_cpu) &&
+		    (msg2->msg_type != MSG_NOOP)) {
+			slot2 = msg2 - mdp->va_queue_first;
+			mmr = uv_read_local_mmr
+				(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+			msg_res = ((msg2->sw_ack_vector << 8) |
+				   msg2->sw_ack_vector);
+			/*
+			 * This is a message retry; clear the resources held
+			 * by the previous message only if they timed out.
+			 * If it has not timed out we have an unexpected
+			 * situation to report.
+			 */
+			if (mmr & (msg_res << 8)) {
+				/*
+				 * is the resource timed out?
+				 * make everyone ignore the cancelled message.
+				 */
+				msg2->canceled = 1;
+				stat->d_canceled++;
+				cancel_count++;
+				uv_write_local_mmr(
+				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+					(msg_res << 8) | msg_res);
+			} else
+				printk(KERN_INFO "note bau retry: no effect\n");
+		}
+	}
+	if (!cancel_count)
+		stat->d_nocanceled++;
+}
 
-	if (msg->replied_to == 1)
-		return;
+/*
+ * Do all the things a cpu should do for a TLB shootdown message.
+ * Other cpu's may come here at the same time for this message.
+ */
+static void uv_bau_process_message(struct msg_desc *mdp,
+				   struct bau_control *bcp)
+{
+	int msg_ack_count;
+	short socket_ack_count = 0;
+	struct ptc_stats *stat;
+	struct bau_payload_queue_entry *msg;
+	struct bau_control *smaster = bcp->socket_master;
 
+	/*
+	 * This must be a normal message, or retry of a normal message
+	 */
+	msg = mdp->msg;
+	stat = &per_cpu(ptcstats, bcp->cpu);
 	if (msg->address == TLB_FLUSH_ALL) {
 		local_flush_tlb();
-		__get_cpu_var(ptcstats).alltlb++;
+		stat->d_alltlb++;
 	} else {
 		__flush_tlb_one(msg->address);
-		__get_cpu_var(ptcstats).onetlb++;
+		stat->d_onetlb++;
 	}
+	stat->d_requestee++;
+
+	/*
+	 * One cpu on each uvhub has the additional job on a RETRY
+	 * of releasing the resource held by the message that is
+	 * being retried.  That message is identified by sending
+	 * cpu number.
+	 */
+	if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
+		uv_bau_process_retry_msg(mdp, bcp);
 
-	__get_cpu_var(ptcstats).requestee++;
+	/*
+	 * This is a sw_ack message, so we have to reply to it.
+	 * Count each responding cpu on the socket. This avoids
+	 * pinging the count's cache line back and forth between
+	 * the sockets.
+	 */
+	socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
+			&smaster->socket_acknowledge_count[mdp->msg_slot]);
+	if (socket_ack_count == bcp->cpus_in_socket) {
+		/*
+		 * Both sockets dump their completed count total into
+		 * the message's count.
+		 */
+		smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
+		msg_ack_count = atomic_add_short_return(socket_ack_count,
+				(struct atomic_short *)&msg->acknowledge_count);
+
+		if (msg_ack_count == bcp->cpus_in_uvhub) {
+			/*
+			 * All cpus in uvhub saw it; reply
+			 */
+			uv_reply_to_message(mdp, bcp);
+		}
+	}
 
-	atomic_inc_short(&msg->acknowledge_count);
-	if (msg->number_of_cpus == msg->acknowledge_count)
-		uv_reply_to_message(sw_ack_slot, msg, msp);
+	return;
 }
 
 /*
- * Examine the payload queue on one distribution node to see
- * which messages have not been seen, and which cpu(s) have not seen them.
+ * Determine the first cpu on a uvhub.
+ */
+static int uvhub_to_first_cpu(int uvhub)
+{
+	int cpu;
+	for_each_present_cpu(cpu)
+		if (uvhub == uv_cpu_to_blade_id(cpu))
+			return cpu;
+	return -1;
+}
+
+/*
+ * Last resort when we get a large number of destination timeouts is
+ * to clear resources held by a given cpu.
+ * Do this with IPI so that all messages in the BAU message queue
+ * can be identified by their nonzero sw_ack_vector field.
  *
- * Returns the number of cpu's that have not responded.
+ * This is entered for a single cpu on the uvhub.
+ * The sender want's this uvhub to free a specific message's
+ * sw_ack resources.
  */
-static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
+static void
+uv_do_reset(void *ptr)
 {
-	struct bau_payload_queue_entry *msg;
-	struct bau_msg_status *msp;
-	int count = 0;
 	int i;
-	int j;
+	int slot;
+	int count = 0;
+	unsigned long mmr;
+	unsigned long msg_res;
+	struct bau_control *bcp;
+	struct reset_args *rap;
+	struct bau_payload_queue_entry *msg;
+	struct ptc_stats *stat;
 
-	for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
-	     msg++, i++) {
-		if ((msg->sending_cpu == sender) && (!msg->replied_to)) {
-			msp = bau_tablesp->msg_statuses + i;
-			printk(KERN_DEBUG
-			       "blade %d: address:%#lx %d of %d, not cpu(s): ",
-			       i, msg->address, msg->acknowledge_count,
-			       msg->number_of_cpus);
-			for (j = 0; j < msg->number_of_cpus; j++) {
-				if (!((1L << j) & msp->seen_by.bits)) {
-					count++;
-					printk("%d ", j);
-				}
+	bcp = &per_cpu(bau_control, smp_processor_id());
+	rap = (struct reset_args *)ptr;
+	stat = &per_cpu(ptcstats, bcp->cpu);
+	stat->d_resets++;
+
+	/*
+	 * We're looking for the given sender, and
+	 * will free its sw_ack resource.
+	 * If all cpu's finally responded after the timeout, its
+	 * message 'replied_to' was set.
+	 */
+	for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
+		/* uv_do_reset: same conditions for cancellation as
+		   uv_bau_process_retry_msg() */
+		if ((msg->replied_to == 0) &&
+		    (msg->canceled == 0) &&
+		    (msg->sending_cpu == rap->sender) &&
+		    (msg->sw_ack_vector) &&
+		    (msg->msg_type != MSG_NOOP)) {
+			/*
+			 * make everyone else ignore this message
+			 */
+			msg->canceled = 1;
+			slot = msg - bcp->va_queue_first;
+			count++;
+			/*
+			 * only reset the resource if it is still pending
+			 */
+			mmr = uv_read_local_mmr
+					(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+			msg_res = ((msg->sw_ack_vector << 8) |
+						   msg->sw_ack_vector);
+			if (mmr & msg_res) {
+				stat->d_rcanceled++;
+				uv_write_local_mmr(
+				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+							msg_res);
 			}
-			printk("\n");
 		}
 	}
-	return count;
+	return;
 }
 
 /*
- * Examine the payload queue on all the distribution nodes to see
- * which messages have not been seen, and which cpu(s) have not seen them.
- *
- * Returns the number of cpu's that have not responded.
+ * Use IPI to get all target uvhubs to release resources held by
+ * a given sending cpu number.
  */
-static int uv_examine_destinations(struct bau_target_nodemask *distribution)
+static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
+			      int sender)
 {
-	int sender;
-	int i;
-	int count = 0;
+	int uvhub;
+	int cpu;
+	cpumask_t mask;
+	struct reset_args reset_args;
 
-	sender = smp_processor_id();
-	for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
-		if (!bau_node_isset(i, distribution))
+	reset_args.sender = sender;
+
+	cpus_clear(mask);
+	/* find a single cpu for each uvhub in this distribution mask */
+	for (uvhub = 0;
+		    uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
+		    uvhub++) {
+		if (!bau_uvhub_isset(uvhub, distribution))
 			continue;
-		count += uv_examine_destination(uv_bau_table_bases[i], sender);
+		/* find a cpu for this uvhub */
+		cpu = uvhub_to_first_cpu(uvhub);
+		cpu_set(cpu, mask);
 	}
-	return count;
+	/* IPI all cpus; Preemption is already disabled */
+	smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
+	return;
+}
+
+static inline unsigned long
+cycles_2_us(unsigned long long cyc)
+{
+	unsigned long long ns;
+	unsigned long us;
+	ns =  (cyc * per_cpu(cyc2ns, smp_processor_id()))
+						>> CYC2NS_SCALE_FACTOR;
+	us = ns / 1000;
+	return us;
 }
 
 /*
- * wait for completion of a broadcast message
- *
- * return COMPLETE, RETRY or GIVEUP
+ * wait for all cpus on this hub to finish their sends and go quiet
+ * leaves uvhub_quiesce set so that no new broadcasts are started by
+ * bau_flush_send_and_wait()
+ */
+static inline void
+quiesce_local_uvhub(struct bau_control *hmaster)
+{
+	atomic_add_short_return(1, (struct atomic_short *)
+		 &hmaster->uvhub_quiesce);
+}
+
+/*
+ * mark this quiet-requestor as done
+ */
+static inline void
+end_uvhub_quiesce(struct bau_control *hmaster)
+{
+	atomic_add_short_return(-1, (struct atomic_short *)
+		&hmaster->uvhub_quiesce);
+}
+
+/*
+ * Wait for completion of a broadcast software ack message
+ * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
  */
 static int uv_wait_completion(struct bau_desc *bau_desc,
-			      unsigned long mmr_offset, int right_shift)
+	unsigned long mmr_offset, int right_shift, int this_cpu,
+	struct bau_control *bcp, struct bau_control *smaster, long try)
 {
-	int exams = 0;
-	long destination_timeouts = 0;
-	long source_timeouts = 0;
+	int relaxes = 0;
 	unsigned long descriptor_status;
+	unsigned long mmr;
+	unsigned long mask;
+	cycles_t ttime;
+	cycles_t timeout_time;
+	struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
+	struct bau_control *hmaster;
 
+	hmaster = bcp->uvhub_master;
+	timeout_time = get_cycles() + bcp->timeout_interval;
+
+	/* spin on the status MMR, waiting for it to go idle */
 	while ((descriptor_status = (((unsigned long)
 		uv_read_local_mmr(mmr_offset) >>
 			right_shift) & UV_ACT_STATUS_MASK)) !=
 			DESC_STATUS_IDLE) {
-		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
-			source_timeouts++;
-			if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
-				source_timeouts = 0;
-			__get_cpu_var(ptcstats).s_retry++;
-			return FLUSH_RETRY;
-		}
 		/*
-		 * spin here looking for progress at the destinations
+		 * Our software ack messages may be blocked because there are
+		 * no swack resources available.  As long as none of them
+		 * has timed out hardware will NACK our message and its
+		 * state will stay IDLE.
 		 */
-		if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
-			destination_timeouts++;
-			if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
-				/*
-				 * returns number of cpus not responding
-				 */
-				if (uv_examine_destinations
-				    (&bau_desc->distribution) == 0) {
-					__get_cpu_var(ptcstats).d_retry++;
-					return FLUSH_RETRY;
-				}
-				exams++;
-				if (exams >= uv_bau_retry_limit) {
-					printk(KERN_DEBUG
-					       "uv_flush_tlb_others");
-					printk("giving up on cpu %d\n",
-					       smp_processor_id());
+		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
+			stat->s_stimeout++;
+			return FLUSH_GIVEUP;
+		} else if (descriptor_status ==
+					DESC_STATUS_DESTINATION_TIMEOUT) {
+			stat->s_dtimeout++;
+			ttime = get_cycles();
+
+			/*
+			 * Our retries may be blocked by all destination
+			 * swack resources being consumed, and a timeout
+			 * pending.  In that case hardware returns the
+			 * ERROR that looks like a destination timeout.
+			 */
+			if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
+				bcp->conseccompletes = 0;
+				return FLUSH_RETRY_PLUGGED;
+			}
+
+			bcp->conseccompletes = 0;
+			return FLUSH_RETRY_TIMEOUT;
+		} else {
+			/*
+			 * descriptor_status is still BUSY
+			 */
+			cpu_relax();
+			relaxes++;
+			if (relaxes >= 10000) {
+				relaxes = 0;
+				if (get_cycles() > timeout_time) {
+					quiesce_local_uvhub(hmaster);
+
+					/* single-thread the register change */
+					spin_lock(&hmaster->masks_lock);
+					mmr = uv_read_local_mmr(mmr_offset);
+					mask = 0UL;
+					mask |= (3UL < right_shift);
+					mask = ~mask;
+					mmr &= mask;
+					uv_write_local_mmr(mmr_offset, mmr);
+					spin_unlock(&hmaster->masks_lock);
+					end_uvhub_quiesce(hmaster);
+					stat->s_busy++;
 					return FLUSH_GIVEUP;
 				}
-				/*
-				 * delays can hang the simulator
-				   udelay(1000);
-				 */
-				destination_timeouts = 0;
 			}
 		}
-		cpu_relax();
 	}
+	bcp->conseccompletes++;
 	return FLUSH_COMPLETE;
 }
 
+static inline cycles_t
+sec_2_cycles(unsigned long sec)
+{
+	unsigned long ns;
+	cycles_t cyc;
+
+	ns = sec * 1000000000;
+	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+	return cyc;
+}
+
+/*
+ * conditionally add 1 to *v, unless *v is >= u
+ * return 0 if we cannot add 1 to *v because it is >= u
+ * return 1 if we can add 1 to *v because it is < u
+ * the add is atomic
+ *
+ * This is close to atomic_add_unless(), but this allows the 'u' value
+ * to be lowered below the current 'v'.  atomic_add_unless can only stop
+ * on equal.
+ */
+static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
+{
+	spin_lock(lock);
+	if (atomic_read(v) >= u) {
+		spin_unlock(lock);
+		return 0;
+	}
+	atomic_inc(v);
+	spin_unlock(lock);
+	return 1;
+}
+
 /**
  * uv_flush_send_and_wait
  *
- * Send a broadcast and wait for a broadcast message to complete.
+ * Send a broadcast and wait for it to complete.
  *
- * The flush_mask contains the cpus the broadcast was sent to.
+ * The flush_mask contains the cpus the broadcast is to be sent to, plus
+ * cpus that are on the local uvhub.
  *
- * Returns NULL if all remote flushing was done. The mask is zeroed.
+ * Returns NULL if all flushing represented in the mask was done. The mask
+ * is zeroed.
  * Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set.
+ * mask will have some bits still set, representing any cpus on the local
+ * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
  */
-const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
-					     struct bau_desc *bau_desc,
-					     struct cpumask *flush_mask)
+const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
+					     struct cpumask *flush_mask,
+					     struct bau_control *bcp)
 {
-	int completion_status = 0;
 	int right_shift;
-	int tries = 0;
-	int pnode;
+	int uvhub;
 	int bit;
+	int completion_status = 0;
+	int seq_number = 0;
+	long try = 0;
+	int cpu = bcp->uvhub_cpu;
+	int this_cpu = bcp->cpu;
+	int this_uvhub = bcp->uvhub;
 	unsigned long mmr_offset;
 	unsigned long index;
 	cycles_t time1;
 	cycles_t time2;
+	struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
+	struct bau_control *smaster = bcp->socket_master;
+	struct bau_control *hmaster = bcp->uvhub_master;
+
+	/*
+	 * Spin here while there are hmaster->max_concurrent or more active
+	 * descriptors. This is the per-uvhub 'throttle'.
+	 */
+	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+			&hmaster->active_descriptor_count,
+			hmaster->max_concurrent)) {
+		stat->s_throttles++;
+		do {
+			cpu_relax();
+		} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+			&hmaster->active_descriptor_count,
+			hmaster->max_concurrent));
+	}
+
+	while (hmaster->uvhub_quiesce)
+		cpu_relax();
 
 	if (cpu < UV_CPUS_PER_ACT_STATUS) {
 		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -269,24 +556,108 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
 	}
 	time1 = get_cycles();
 	do {
-		tries++;
+		/*
+		 * Every message from any given cpu gets a unique message
+		 * sequence number. But retries use that same number.
+		 * Our message may have timed out at the destination because
+		 * all sw-ack resources are in use and there is a timeout
+		 * pending there.  In that case, our last send never got
+		 * placed into the queue and we need to persist until it
+		 * does.
+		 *
+		 * Make any retry a type MSG_RETRY so that the destination will
+		 * free any resource held by a previous message from this cpu.
+		 */
+		if (try == 0) {
+			/* use message type set by the caller the first time */
+			seq_number = bcp->message_number++;
+		} else {
+			/* use RETRY type on all the rest; same sequence */
+			bau_desc->header.msg_type = MSG_RETRY;
+			stat->s_retry_messages++;
+		}
+		bau_desc->header.sequence = seq_number;
 		index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
-			cpu;
+			bcp->uvhub_cpu;
+		bcp->send_message = get_cycles();
+
 		uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+
+		try++;
 		completion_status = uv_wait_completion(bau_desc, mmr_offset,
-					right_shift);
-	} while (completion_status == FLUSH_RETRY);
+			right_shift, this_cpu, bcp, smaster, try);
+
+		if (completion_status == FLUSH_RETRY_PLUGGED) {
+			/*
+			 * Our retries may be blocked by all destination swack
+			 * resources being consumed, and a timeout pending. In
+			 * that case hardware immediately returns the ERROR
+			 * that looks like a destination timeout.
+			 */
+			udelay(TIMEOUT_DELAY);
+			bcp->plugged_tries++;
+			if (bcp->plugged_tries >= PLUGSB4RESET) {
+				bcp->plugged_tries = 0;
+				quiesce_local_uvhub(hmaster);
+				spin_lock(&hmaster->queue_lock);
+				uv_reset_with_ipi(&bau_desc->distribution,
+							this_cpu);
+				spin_unlock(&hmaster->queue_lock);
+				end_uvhub_quiesce(hmaster);
+				bcp->ipi_attempts++;
+				stat->s_resets_plug++;
+			}
+		} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
+			hmaster->max_concurrent = 1;
+			bcp->timeout_tries++;
+			udelay(TIMEOUT_DELAY);
+			if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
+				bcp->timeout_tries = 0;
+				quiesce_local_uvhub(hmaster);
+				spin_lock(&hmaster->queue_lock);
+				uv_reset_with_ipi(&bau_desc->distribution,
+								this_cpu);
+				spin_unlock(&hmaster->queue_lock);
+				end_uvhub_quiesce(hmaster);
+				bcp->ipi_attempts++;
+				stat->s_resets_timeout++;
+			}
+		}
+		if (bcp->ipi_attempts >= 3) {
+			bcp->ipi_attempts = 0;
+			completion_status = FLUSH_GIVEUP;
+			break;
+		}
+		cpu_relax();
+	} while ((completion_status == FLUSH_RETRY_PLUGGED) ||
+		 (completion_status == FLUSH_RETRY_TIMEOUT));
 	time2 = get_cycles();
-	__get_cpu_var(ptcstats).sflush += (time2 - time1);
-	if (tries > 1)
-		__get_cpu_var(ptcstats).retriesok++;
 
-	if (completion_status == FLUSH_GIVEUP) {
+	if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
+	    && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
+			hmaster->max_concurrent++;
+
+	/*
+	 * hold any cpu not timing out here; no other cpu currently held by
+	 * the 'throttle' should enter the activation code
+	 */
+	while (hmaster->uvhub_quiesce)
+		cpu_relax();
+	atomic_dec(&hmaster->active_descriptor_count);
+
+	/* guard against cycles wrap */
+	if (time2 > time1)
+		stat->s_time += (time2 - time1);
+	else
+		stat->s_requestor--; /* don't count this one */
+	if (completion_status == FLUSH_COMPLETE && try > 1)
+		stat->s_retriesok++;
+	else if (completion_status == FLUSH_GIVEUP) {
 		/*
 		 * Cause the caller to do an IPI-style TLB shootdown on
-		 * the cpu's, all of which are still in the mask.
+		 * the target cpu's, all of which are still in the mask.
 		 */
-		__get_cpu_var(ptcstats).ptc_i++;
+		stat->s_giveup++;
 		return flush_mask;
 	}
 
@@ -295,18 +666,17 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
 	 * use the IPI method of shootdown on them.
 	 */
 	for_each_cpu(bit, flush_mask) {
-		pnode = uv_cpu_to_pnode(bit);
-		if (pnode == this_pnode)
+		uvhub = uv_cpu_to_blade_id(bit);
+		if (uvhub == this_uvhub)
 			continue;
 		cpumask_clear_cpu(bit, flush_mask);
 	}
 	if (!cpumask_empty(flush_mask))
 		return flush_mask;
+
 	return NULL;
 }
 
-static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
-
 /**
  * uv_flush_tlb_others - globally purge translation cache of a virtual
  * address or all TLB's
@@ -323,8 +693,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
  * The caller has derived the cpumask from the mm_struct.  This function
  * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
  *
- * The cpumask is converted into a nodemask of the nodes containing
- * the cpus.
+ * The cpumask is converted into a uvhubmask of the uvhubs containing
+ * those cpus.
  *
  * Note that this function should be called with preemption disabled.
  *
@@ -336,52 +706,82 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 					  struct mm_struct *mm,
 					  unsigned long va, unsigned int cpu)
 {
-	struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
-	int i;
-	int bit;
-	int pnode;
-	int uv_cpu;
-	int this_pnode;
+	int remotes;
+	int tcpu;
+	int uvhub;
 	int locals = 0;
 	struct bau_desc *bau_desc;
+	struct cpumask *flush_mask;
+	struct ptc_stats *stat;
+	struct bau_control *bcp;
 
-	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+	if (nobau)
+		return cpumask;
 
-	uv_cpu = uv_blade_processor_id();
-	this_pnode = uv_hub_info->pnode;
-	bau_desc = __get_cpu_var(bau_control).descriptor_base;
-	bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
+	bcp = &per_cpu(bau_control, cpu);
+	/*
+	 * Each sending cpu has a per-cpu mask which it fills from the caller's
+	 * cpu mask.  Only remote cpus are converted to uvhubs and copied.
+	 */
+	flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
+	/*
+	 * copy cpumask to flush_mask, removing current cpu
+	 * (current cpu should already have been flushed by the caller and
+	 *  should never be returned if we return flush_mask)
+	 */
+	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+	if (cpu_isset(cpu, *cpumask))
+		locals++;  /* current cpu was targeted */
 
-	bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+	bau_desc = bcp->descriptor_base;
+	bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
 
-	i = 0;
-	for_each_cpu(bit, flush_mask) {
-		pnode = uv_cpu_to_pnode(bit);
-		BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1));
-		if (pnode == this_pnode) {
+	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+	remotes = 0;
+	for_each_cpu(tcpu, flush_mask) {
+		uvhub = uv_cpu_to_blade_id(tcpu);
+		if (uvhub == bcp->uvhub) {
 			locals++;
 			continue;
 		}
-		bau_node_set(pnode - uv_partition_base_pnode,
-				&bau_desc->distribution);
-		i++;
+		bau_uvhub_set(uvhub, &bau_desc->distribution);
+		remotes++;
 	}
-	if (i == 0) {
+	if (remotes == 0) {
 		/*
-		 * no off_node flushing; return status for local node
+		 * No off_hub flushing; return status for local hub.
+		 * Return the caller's mask if all were local (the current
+		 * cpu may be in that mask).
 		 */
 		if (locals)
-			return flush_mask;
+			return cpumask;
 		else
 			return NULL;
 	}
-	__get_cpu_var(ptcstats).requestor++;
-	__get_cpu_var(ptcstats).ntargeted += i;
+	stat = &per_cpu(ptcstats, cpu);
+	stat->s_requestor++;
+	stat->s_ntargcpu += remotes;
+	remotes = bau_uvhub_weight(&bau_desc->distribution);
+	stat->s_ntarguvhub += remotes;
+	if (remotes >= 16)
+		stat->s_ntarguvhub16++;
+	else if (remotes >= 8)
+		stat->s_ntarguvhub8++;
+	else if (remotes >= 4)
+		stat->s_ntarguvhub4++;
+	else if (remotes >= 2)
+		stat->s_ntarguvhub2++;
+	else
+		stat->s_ntarguvhub1++;
 
 	bau_desc->payload.address = va;
 	bau_desc->payload.sending_cpu = cpu;
 
-	return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask);
+	/*
+	 * uv_flush_send_and_wait returns null if all cpu's were messaged, or
+	 * the adjusted flush_mask if any cpu's were not messaged.
+	 */
+	return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
 }
 
 /*
@@ -390,87 +790,70 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
  *
  * We received a broadcast assist message.
  *
- * Interrupts may have been disabled; this interrupt could represent
+ * Interrupts are disabled; this interrupt could represent
  * the receipt of several messages.
  *
- * All cores/threads on this node get this interrupt.
- * The last one to see it does the s/w ack.
+ * All cores/threads on this hub get this interrupt.
+ * The last one to see it does the software ack.
  * (the resource will not be freed until noninterruptable cpus see this
- *  interrupt; hardware will timeout the s/w ack and reply ERROR)
+ *  interrupt; hardware may timeout the s/w ack and reply ERROR)
  */
 void uv_bau_message_interrupt(struct pt_regs *regs)
 {
-	struct bau_payload_queue_entry *va_queue_first;
-	struct bau_payload_queue_entry *va_queue_last;
-	struct bau_payload_queue_entry *msg;
-	struct pt_regs *old_regs = set_irq_regs(regs);
-	cycles_t time1;
-	cycles_t time2;
-	int msg_slot;
-	int sw_ack_slot;
-	int fw;
 	int count = 0;
-	unsigned long local_pnode;
-
-	ack_APIC_irq();
-	exit_idle();
-	irq_enter();
-
-	time1 = get_cycles();
-
-	local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
-
-	va_queue_first = __get_cpu_var(bau_control).va_queue_first;
-	va_queue_last = __get_cpu_var(bau_control).va_queue_last;
-
-	msg = __get_cpu_var(bau_control).bau_msg_head;
+	cycles_t time_start;
+	struct bau_payload_queue_entry *msg;
+	struct bau_control *bcp;
+	struct ptc_stats *stat;
+	struct msg_desc msgdesc;
+
+	time_start = get_cycles();
+	bcp = &per_cpu(bau_control, smp_processor_id());
+	stat = &per_cpu(ptcstats, smp_processor_id());
+	msgdesc.va_queue_first = bcp->va_queue_first;
+	msgdesc.va_queue_last = bcp->va_queue_last;
+	msg = bcp->bau_msg_head;
 	while (msg->sw_ack_vector) {
 		count++;
-		fw = msg->sw_ack_vector;
-		msg_slot = msg - va_queue_first;
-		sw_ack_slot = ffs(fw) - 1;
-
-		uv_bau_process_message(msg, msg_slot, sw_ack_slot);
-
+		msgdesc.msg_slot = msg - msgdesc.va_queue_first;
+		msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
+		msgdesc.msg = msg;
+		uv_bau_process_message(&msgdesc, bcp);
 		msg++;
-		if (msg > va_queue_last)
-			msg = va_queue_first;
-		__get_cpu_var(bau_control).bau_msg_head = msg;
+		if (msg > msgdesc.va_queue_last)
+			msg = msgdesc.va_queue_first;
+		bcp->bau_msg_head = msg;
 	}
+	stat->d_time += (get_cycles() - time_start);
 	if (!count)
-		__get_cpu_var(ptcstats).nomsg++;
+		stat->d_nomsg++;
 	else if (count > 1)
-		__get_cpu_var(ptcstats).multmsg++;
-
-	time2 = get_cycles();
-	__get_cpu_var(ptcstats).dflush += (time2 - time1);
-
-	irq_exit();
-	set_irq_regs(old_regs);
+		stat->d_multmsg++;
+	ack_APIC_irq();
 }
 
 /*
  * uv_enable_timeouts
  *
- * Each target blade (i.e. blades that have cpu's) needs to have
+ * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have
  * shootdown message timeouts enabled.  The timeout does not cause
  * an interrupt, but causes an error message to be returned to
  * the sender.
  */
 static void uv_enable_timeouts(void)
 {
-	int blade;
-	int nblades;
+	int uvhub;
+	int nuvhubs;
 	int pnode;
 	unsigned long mmr_image;
 
-	nblades = uv_num_possible_blades();
+	nuvhubs = uv_num_possible_blades();
 
-	for (blade = 0; blade < nblades; blade++) {
-		if (!uv_blade_nr_possible_cpus(blade))
+	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+		if (!uv_blade_nr_possible_cpus(uvhub))
 			continue;
 
-		pnode = uv_blade_to_pnode(blade);
+		pnode = uv_blade_to_pnode(uvhub);
 		mmr_image =
 		    uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
 		/*
@@ -523,9 +906,20 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
 {
 }
 
+static inline unsigned long long
+millisec_2_cycles(unsigned long millisec)
+{
+	unsigned long ns;
+	unsigned long long cyc;
+
+	ns = millisec * 1000;
+	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+	return cyc;
+}
+
 /*
- * Display the statistics thru /proc
- * data points to the cpu number
+ * Display the statistics thru /proc.
+ * 'data' points to the cpu number
  */
 static int uv_ptc_seq_show(struct seq_file *file, void *data)
 {
@@ -536,78 +930,155 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 
 	if (!cpu) {
 		seq_printf(file,
-		"# cpu requestor requestee one all sretry dretry ptc_i ");
+			"# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
 		seq_printf(file,
-		"sw_ack sflush dflush sok dnomsg dmult starget\n");
+			"numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
+		seq_printf(file,
+			"retries rok resetp resett giveup sto bz throt ");
+		seq_printf(file,
+			"sw_ack recv rtime all ");
+		seq_printf(file,
+			"one mult none retry canc nocan reset rcan\n");
 	}
 	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
 		stat = &per_cpu(ptcstats, cpu);
-		seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
-			   cpu, stat->requestor,
-			   stat->requestee, stat->onetlb, stat->alltlb,
-			   stat->s_retry, stat->d_retry, stat->ptc_i);
-		seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
+		/* source side statistics */
+		seq_printf(file,
+			"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
+			   cpu, stat->s_requestor, cycles_2_us(stat->s_time),
+			   stat->s_ntarguvhub, stat->s_ntarguvhub16,
+			   stat->s_ntarguvhub8, stat->s_ntarguvhub4,
+			   stat->s_ntarguvhub2, stat->s_ntarguvhub1,
+			   stat->s_ntargcpu, stat->s_dtimeout);
+		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
+			   stat->s_retry_messages, stat->s_retriesok,
+			   stat->s_resets_plug, stat->s_resets_timeout,
+			   stat->s_giveup, stat->s_stimeout,
+			   stat->s_busy, stat->s_throttles);
+		/* destination side statistics */
+		seq_printf(file,
+			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
 			   uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
 					UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
-			   stat->sflush, stat->dflush,
-			   stat->retriesok, stat->nomsg,
-			   stat->multmsg, stat->ntargeted);
+			   stat->d_requestee, cycles_2_us(stat->d_time),
+			   stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
+			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
+			   stat->d_nocanceled, stat->d_resets,
+			   stat->d_rcanceled);
 	}
 
 	return 0;
 }
 
 /*
+ * -1: resetf the statistics
  *  0: display meaning of the statistics
- * >0: retry limit
+ * >0: maximum concurrent active descriptors per uvhub (throttle)
  */
 static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 				 size_t count, loff_t *data)
 {
-	long newmode;
+	int cpu;
+	long input_arg;
 	char optstr[64];
+	struct ptc_stats *stat;
+	struct bau_control *bcp;
 
 	if (count == 0 || count > sizeof(optstr))
 		return -EINVAL;
 	if (copy_from_user(optstr, user, count))
 		return -EFAULT;
 	optstr[count - 1] = '\0';
-	if (strict_strtoul(optstr, 10, &newmode) < 0) {
+	if (strict_strtol(optstr, 10, &input_arg) < 0) {
 		printk(KERN_DEBUG "%s is invalid\n", optstr);
 		return -EINVAL;
 	}
 
-	if (newmode == 0) {
+	if (input_arg == 0) {
 		printk(KERN_DEBUG "# cpu:      cpu number\n");
+		printk(KERN_DEBUG "Sender statistics:\n");
+		printk(KERN_DEBUG
+		"sent:     number of shootdown messages sent\n");
+		printk(KERN_DEBUG
+		"stime:    time spent sending messages\n");
+		printk(KERN_DEBUG
+		"numuvhubs: number of hubs targeted with shootdown\n");
+		printk(KERN_DEBUG
+		"numuvhubs16: number times 16 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs8: number times 8 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs4: number times 4 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs2: number times 2 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs1: number times 1 hub targeted\n");
+		printk(KERN_DEBUG
+		"numcpus:  number of cpus targeted with shootdown\n");
+		printk(KERN_DEBUG
+		"dto:      number of destination timeouts\n");
+		printk(KERN_DEBUG
+		"retries:  destination timeout retries sent\n");
+		printk(KERN_DEBUG
+		"rok:   :  destination timeouts successfully retried\n");
+		printk(KERN_DEBUG
+		"resetp:   ipi-style resource resets for plugs\n");
+		printk(KERN_DEBUG
+		"resett:   ipi-style resource resets for timeouts\n");
+		printk(KERN_DEBUG
+		"giveup:   fall-backs to ipi-style shootdowns\n");
+		printk(KERN_DEBUG
+		"sto:      number of source timeouts\n");
+		printk(KERN_DEBUG
+		"bz:       number of stay-busy's\n");
+		printk(KERN_DEBUG
+		"throt:    number times spun in throttle\n");
+		printk(KERN_DEBUG "Destination side statistics:\n");
 		printk(KERN_DEBUG
-		"requestor:  times this cpu was the flush requestor\n");
+		"sw_ack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
 		printk(KERN_DEBUG
-		"requestee:  times this cpu was requested to flush its TLBs\n");
+		"recv:     shootdown messages received\n");
 		printk(KERN_DEBUG
-		"one:        times requested to flush a single address\n");
+		"rtime:    time spent processing messages\n");
 		printk(KERN_DEBUG
-		"all:        times requested to flush all TLB's\n");
+		"all:      shootdown all-tlb messages\n");
 		printk(KERN_DEBUG
-		"sretry:     number of retries of source-side timeouts\n");
+		"one:      shootdown one-tlb messages\n");
 		printk(KERN_DEBUG
-		"dretry:     number of retries of destination-side timeouts\n");
+		"mult:     interrupts that found multiple messages\n");
 		printk(KERN_DEBUG
-		"ptc_i:      times UV fell through to IPI-style flushes\n");
+		"none:     interrupts that found no messages\n");
 		printk(KERN_DEBUG
-		"sw_ack:     image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
+		"retry:    number of retry messages processed\n");
 		printk(KERN_DEBUG
-		"sflush_us:  cycles spent in uv_flush_tlb_others()\n");
+		"canc:     number messages canceled by retries\n");
 		printk(KERN_DEBUG
-		"dflush_us:  cycles spent in handling flush requests\n");
-		printk(KERN_DEBUG "sok:        successes on retry\n");
-		printk(KERN_DEBUG "dnomsg:     interrupts with no message\n");
+		"nocan:    number retries that found nothing to cancel\n");
 		printk(KERN_DEBUG
-		"dmult:      interrupts with multiple messages\n");
-		printk(KERN_DEBUG "starget:    nodes targeted\n");
+		"reset:    number of ipi-style reset requests processed\n");
+		printk(KERN_DEBUG
+		"rcan:     number messages canceled by reset requests\n");
+	} else if (input_arg == -1) {
+		for_each_present_cpu(cpu) {
+			stat = &per_cpu(ptcstats, cpu);
+			memset(stat, 0, sizeof(struct ptc_stats));
+		}
 	} else {
-		uv_bau_retry_limit = newmode;
-		printk(KERN_DEBUG "timeout retry limit:%d\n",
-		       uv_bau_retry_limit);
+		uv_bau_max_concurrent = input_arg;
+		bcp = &per_cpu(bau_control, smp_processor_id());
+		if (uv_bau_max_concurrent < 1 ||
+		    uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
+			printk(KERN_DEBUG
+				"Error: BAU max concurrent %d; %d is invalid\n",
+				bcp->max_concurrent, uv_bau_max_concurrent);
+			return -EINVAL;
+		}
+		printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
+		       uv_bau_max_concurrent);
+		for_each_present_cpu(cpu) {
+			bcp = &per_cpu(bau_control, cpu);
+			bcp->max_concurrent = uv_bau_max_concurrent;
+		}
 	}
 
 	return count;
@@ -650,80 +1121,31 @@ static int __init uv_ptc_init(void)
 	return 0;
 }
 
-/*
- * begin the initialization of the per-blade control structures
- */
-static struct bau_control * __init uv_table_bases_init(int blade, int node)
-{
-	int i;
-	struct bau_msg_status *msp;
-	struct bau_control *bau_tabp;
-
-	bau_tabp =
-	    kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
-	BUG_ON(!bau_tabp);
-
-	bau_tabp->msg_statuses =
-	    kmalloc_node(sizeof(struct bau_msg_status) *
-			 DEST_Q_SIZE, GFP_KERNEL, node);
-	BUG_ON(!bau_tabp->msg_statuses);
-
-	for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
-		bau_cpubits_clear(&msp->seen_by, (int)
-				  uv_blade_nr_possible_cpus(blade));
-
-	uv_bau_table_bases[blade] = bau_tabp;
-
-	return bau_tabp;
-}
-
-/*
- * finish the initialization of the per-blade control structures
- */
-static void __init
-uv_table_bases_finish(int blade,
-		      struct bau_control *bau_tablesp,
-		      struct bau_desc *adp)
-{
-	struct bau_control *bcp;
-	int cpu;
-
-	for_each_present_cpu(cpu) {
-		if (blade != uv_cpu_to_blade_id(cpu))
-			continue;
-
-		bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
-		bcp->bau_msg_head	= bau_tablesp->va_queue_first;
-		bcp->va_queue_first	= bau_tablesp->va_queue_first;
-		bcp->va_queue_last	= bau_tablesp->va_queue_last;
-		bcp->msg_statuses	= bau_tablesp->msg_statuses;
-		bcp->descriptor_base	= adp;
-	}
-}
-
 /*
  * initialize the sending side's sending buffers
  */
-static struct bau_desc * __init
+static void
 uv_activation_descriptor_init(int node, int pnode)
 {
 	int i;
+	int cpu;
 	unsigned long pa;
 	unsigned long m;
 	unsigned long n;
-	struct bau_desc *adp;
-	struct bau_desc *ad2;
+	struct bau_desc *bau_desc;
+	struct bau_desc *bd2;
+	struct bau_control *bcp;
 
 	/*
 	 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
-	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
 	 */
-	adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+	bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
 		UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
-	BUG_ON(!adp);
+	BUG_ON(!bau_desc);
 
-	pa = uv_gpa(adp); /* need the real nasid*/
-	n = uv_gpa_to_pnode(pa);
+	pa = uv_gpa(bau_desc); /* need the real nasid*/
+	n = pa >> uv_nshift;
 	m = pa & uv_mmask;
 
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -732,96 +1154,188 @@ uv_activation_descriptor_init(int node, int pnode)
 	/*
 	 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
 	 * cpu even though we only use the first one; one descriptor can
-	 * describe a broadcast to 256 nodes.
+	 * describe a broadcast to 256 uv hubs.
 	 */
-	for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
-		i++, ad2++) {
-		memset(ad2, 0, sizeof(struct bau_desc));
-		ad2->header.sw_ack_flag = 1;
+	for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+		i++, bd2++) {
+		memset(bd2, 0, sizeof(struct bau_desc));
+		bd2->header.sw_ack_flag = 1;
 		/*
-		 * base_dest_nodeid is the first node in the partition, so
-		 * the bit map will indicate partition-relative node numbers.
-		 * note that base_dest_nodeid is actually a nasid.
+		 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
+		 * in the partition. The bit map will indicate uvhub numbers,
+		 * which are 0-N in a partition. Pnodes are unique system-wide.
 		 */
-		ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
-		ad2->header.dest_subnodeid = 0x10; /* the LB */
-		ad2->header.command = UV_NET_ENDPOINT_INTD;
-		ad2->header.int_both = 1;
+		bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
+		bd2->header.dest_subnodeid = 0x10; /* the LB */
+		bd2->header.command = UV_NET_ENDPOINT_INTD;
+		bd2->header.int_both = 1;
 		/*
 		 * all others need to be set to zero:
 		 *   fairness chaining multilevel count replied_to
 		 */
 	}
-	return adp;
+	for_each_present_cpu(cpu) {
+		if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
+			continue;
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->descriptor_base = bau_desc;
+	}
 }
 
 /*
  * initialize the destination side's receiving buffers
+ * entered for each uvhub in the partition
+ * - node is first node (kernel memory notion) on the uvhub
+ * - pnode is the uvhub's physical identifier
  */
-static struct bau_payload_queue_entry * __init
-uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
+static void
+uv_payload_queue_init(int node, int pnode)
 {
-	struct bau_payload_queue_entry *pqp;
-	unsigned long pa;
 	int pn;
+	int cpu;
 	char *cp;
+	unsigned long pa;
+	struct bau_payload_queue_entry *pqp;
+	struct bau_payload_queue_entry *pqp_malloc;
+	struct bau_control *bcp;
 
 	pqp = (struct bau_payload_queue_entry *) kmalloc_node(
 		(DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
 		GFP_KERNEL, node);
 	BUG_ON(!pqp);
+	pqp_malloc = pqp;
 
 	cp = (char *)pqp + 31;
 	pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
-	bau_tablesp->va_queue_first = pqp;
+
+	for_each_present_cpu(cpu) {
+		if (pnode != uv_cpu_to_pnode(cpu))
+			continue;
+		/* for every cpu on this pnode: */
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->va_queue_first = pqp;
+		bcp->bau_msg_head = pqp;
+		bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
+	}
 	/*
 	 * need the pnode of where the memory was really allocated
 	 */
 	pa = uv_gpa(pqp);
-	pn = uv_gpa_to_pnode(pa);
+	pn = pa >> uv_nshift;
 	uv_write_global_mmr64(pnode,
 			      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
 			      ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
 			      uv_physnodeaddr(pqp));
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
 			      uv_physnodeaddr(pqp));
-	bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
 			      (unsigned long)
-			      uv_physnodeaddr(bau_tablesp->va_queue_last));
+			      uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
+	/* in effect, all msg_type's are set to MSG_NOOP */
 	memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
-
-	return pqp;
 }
 
 /*
- * Initialization of each UV blade's structures
+ * Initialization of each UV hub's structures
  */
-static int __init uv_init_blade(int blade)
+static void __init uv_init_uvhub(int uvhub, int vector)
 {
 	int node;
 	int pnode;
-	unsigned long pa;
 	unsigned long apicid;
-	struct bau_desc *adp;
-	struct bau_payload_queue_entry *pqp;
-	struct bau_control *bau_tablesp;
-
-	node = blade_to_first_node(blade);
-	bau_tablesp = uv_table_bases_init(blade, node);
-	pnode = uv_blade_to_pnode(blade);
-	adp = uv_activation_descriptor_init(node, pnode);
-	pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
-	uv_table_bases_finish(blade, bau_tablesp, adp);
+
+	node = uvhub_to_first_node(uvhub);
+	pnode = uv_blade_to_pnode(uvhub);
+	uv_activation_descriptor_init(node, pnode);
+	uv_payload_queue_init(node, pnode);
 	/*
 	 * the below initialization can't be in firmware because the
 	 * messaging IRQ will be determined by the OS
 	 */
-	apicid = blade_to_first_apicid(blade);
-	pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
+	apicid = uvhub_to_first_apicid(uvhub);
 	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
-				      ((apicid << 32) | UV_BAU_MESSAGE));
-	return 0;
+				      ((apicid << 32) | vector));
+}
+
+/*
+ * initialize the bau_control structure for each cpu
+ */
+static void uv_init_per_cpu(int nuvhubs)
+{
+	int i, j, k;
+	int cpu;
+	int pnode;
+	int uvhub;
+	short socket = 0;
+	struct bau_control *bcp;
+	struct uvhub_desc *bdp;
+	struct socket_desc *sdp;
+	struct bau_control *hmaster = NULL;
+	struct bau_control *smaster = NULL;
+	struct socket_desc {
+		short num_cpus;
+		short cpu_number[16];
+	};
+	struct uvhub_desc {
+		short num_sockets;
+		short num_cpus;
+		short uvhub;
+		short pnode;
+		struct socket_desc socket[2];
+	};
+	struct uvhub_desc *uvhub_descs;
+
+	uvhub_descs = (struct uvhub_desc *)
+		kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
+	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+	for_each_present_cpu(cpu) {
+		bcp = &per_cpu(bau_control, cpu);
+		memset(bcp, 0, sizeof(struct bau_control));
+		spin_lock_init(&bcp->masks_lock);
+		bcp->max_concurrent = uv_bau_max_concurrent;
+		pnode = uv_cpu_hub_info(cpu)->pnode;
+		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+		bdp = &uvhub_descs[uvhub];
+		bdp->num_cpus++;
+		bdp->uvhub = uvhub;
+		bdp->pnode = pnode;
+		/* time interval to catch a hardware stay-busy bug */
+		bcp->timeout_interval = millisec_2_cycles(3);
+		/* kludge: assume uv_hub.h is constant */
+		socket = (cpu_physical_id(cpu)>>5)&1;
+		if (socket >= bdp->num_sockets)
+			bdp->num_sockets = socket+1;
+		sdp = &bdp->socket[socket];
+		sdp->cpu_number[sdp->num_cpus] = cpu;
+		sdp->num_cpus++;
+	}
+	socket = 0;
+	for_each_possible_blade(uvhub) {
+		bdp = &uvhub_descs[uvhub];
+		for (i = 0; i < bdp->num_sockets; i++) {
+			sdp = &bdp->socket[i];
+			for (j = 0; j < sdp->num_cpus; j++) {
+				cpu = sdp->cpu_number[j];
+				bcp = &per_cpu(bau_control, cpu);
+				bcp->cpu = cpu;
+				if (j == 0) {
+					smaster = bcp;
+					if (i == 0)
+						hmaster = bcp;
+				}
+				bcp->cpus_in_uvhub = bdp->num_cpus;
+				bcp->cpus_in_socket = sdp->num_cpus;
+				bcp->socket_master = smaster;
+				bcp->uvhub_master = hmaster;
+				for (k = 0; k < DEST_Q_SIZE; k++)
+					bcp->socket_acknowledge_count[k] = 0;
+				bcp->uvhub_cpu =
+				  uv_cpu_hub_info(cpu)->blade_processor_id;
+			}
+			socket++;
+		}
+	}
+	kfree(uvhub_descs);
 }
 
 /*
@@ -829,38 +1343,54 @@ static int __init uv_init_blade(int blade)
  */
 static int __init uv_bau_init(void)
 {
-	int blade;
-	int nblades;
+	int uvhub;
+	int pnode;
+	int nuvhubs;
 	int cur_cpu;
+	int vector;
+	unsigned long mmr;
 
 	if (!is_uv_system())
 		return 0;
 
+	if (nobau)
+		return 0;
+
 	for_each_possible_cpu(cur_cpu)
 		zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
 				       GFP_KERNEL, cpu_to_node(cur_cpu));
 
-	uv_bau_retry_limit = 1;
+	uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
+	uv_nshift = uv_hub_info->m_val;
 	uv_mmask = (1UL << uv_hub_info->m_val) - 1;
-	nblades = uv_num_possible_blades();
+	nuvhubs = uv_num_possible_blades();
 
-	uv_bau_table_bases = (struct bau_control **)
-	    kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
-	BUG_ON(!uv_bau_table_bases);
+	uv_init_per_cpu(nuvhubs);
 
 	uv_partition_base_pnode = 0x7fffffff;
-	for (blade = 0; blade < nblades; blade++)
-		if (uv_blade_nr_possible_cpus(blade) &&
-			(uv_blade_to_pnode(blade) < uv_partition_base_pnode))
-			uv_partition_base_pnode = uv_blade_to_pnode(blade);
-	for (blade = 0; blade < nblades; blade++)
-		if (uv_blade_nr_possible_cpus(blade))
-			uv_init_blade(blade);
-
-	alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
+	for (uvhub = 0; uvhub < nuvhubs; uvhub++)
+		if (uv_blade_nr_possible_cpus(uvhub) &&
+			(uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
+			uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
+
+	vector = UV_BAU_MESSAGE;
+	for_each_possible_blade(uvhub)
+		if (uv_blade_nr_possible_cpus(uvhub))
+			uv_init_uvhub(uvhub, vector);
+
 	uv_enable_timeouts();
+	alloc_intr_gate(vector, uv_bau_message_intr1);
+
+	for_each_possible_blade(uvhub) {
+		pnode = uv_blade_to_pnode(uvhub);
+		/* INIT the bau */
+		uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
+				      ((unsigned long)1 << 63));
+		mmr = 1; /* should be 1 to broadcast to both sockets */
+		uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
+	}
 
 	return 0;
 }
-__initcall(uv_bau_init);
-__initcall(uv_ptc_init);
+core_initcall(uv_bau_init);
+core_initcall(uv_ptc_init);
-- 
cgit v1.2.3-55-g7522


From a289cc7c70da784a2d370b91885cab4f966dcb0f Mon Sep 17 00:00:00 2001
From: Randy Dunlap
Date: Fri, 16 Apr 2010 17:51:42 -0700
Subject: x86, UV: uv_irq.c: Fix all sparse warnings

Fix all sparse warnings in building uv_irq.c.

 arch/x86/kernel/uv_irq.c:46:17: warning: symbol 'uv_irq_chip' was not declared. Should it be static?
 arch/x86/kernel/uv_irq.c:143:50: error: no identifier for function argument
 arch/x86/kernel/uv_irq.c:162:13: error: typename in expression
 arch/x86/kernel/uv_irq.c:162:13: error: undefined identifier 'restrict'
 arch/x86/kernel/uv_irq.c:250:44: error: no identifier for function argument
 arch/x86/kernel/uv_irq.c:260:17: error: typename in expression
 arch/x86/kernel/uv_irq.c:260:17: error: undefined identifier 'restrict'
 arch/x86/kernel/uv_irq.c:233:50: warning: incorrect type in argument 3 (different signedness)
 arch/x86/kernel/uv_irq.c:233:50:    expected int *pnode
 arch/x86/kernel/uv_irq.c:233:50:    got unsigned int *<noident>
 arch/x86/include/asm/uv/uv_hub.h:318:44: warning: incorrect type in argument 2 (different address spaces)
 arch/x86/include/asm/uv/uv_hub.h:318:44:    expected void volatile [noderef] <asn:2>*addr
 arch/x86/include/asm/uv/uv_hub.h:318:44:    got unsigned long *

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Mike Travis <travis@sgi.com>
Cc: Cliff Wickman <cpw@sgi.com>
Cc: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20100416175142.f4b59683.randy.dunlap@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_hub.h |  2 +-
 arch/x86/kernel/uv_irq.c         | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 14cc74ba5d23..bf6b88ef8eeb 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -307,7 +307,7 @@ static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset
  * Access Global MMR space using the MMR space located at the top of physical
  * memory.
  */
-static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset)
+static inline volatile void __iomem *uv_global_mmr64_address(int pnode, unsigned long offset)
 {
 	return __va(UV_GLOBAL_MMR64_BASE |
 		    UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index ece73d8e3240..1a9f55a33489 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -43,7 +43,7 @@ static void uv_ack_apic(unsigned int irq)
 	ack_APIC_irq();
 }
 
-struct irq_chip uv_irq_chip = {
+static struct irq_chip uv_irq_chip = {
 	.name		= "UV-CORE",
 	.startup	= uv_noop_ret,
 	.shutdown	= uv_noop,
@@ -140,7 +140,7 @@ int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
  */
 static int
 arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
-		       unsigned long mmr_offset, int restrict)
+		       unsigned long mmr_offset, int limit)
 {
 	const struct cpumask *eligible_cpu = cpumask_of(cpu);
 	struct irq_desc *desc = irq_to_desc(irq);
@@ -159,7 +159,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 	if (err != 0)
 		return err;
 
-	if (restrict == UV_AFFINITY_CPU)
+	if (limit == UV_AFFINITY_CPU)
 		desc->status |= IRQ_NO_BALANCING;
 	else
 		desc->status |= IRQ_MOVE_PCNTXT;
@@ -213,7 +213,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
 	unsigned long mmr_value;
 	struct uv_IO_APIC_route_entry *entry;
 	unsigned long mmr_offset;
-	unsigned mmr_pnode;
+	int mmr_pnode;
 
 	if (set_desc_affinity(desc, mask, &dest))
 		return -1;
@@ -247,7 +247,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
  * interrupt is raised.
  */
 int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
-		 unsigned long mmr_offset, int restrict)
+		 unsigned long mmr_offset, int limit)
 {
 	int irq, ret;
 
@@ -257,7 +257,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
 		return -EBUSY;
 
 	ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
-		restrict);
+		limit);
 	if (ret == irq)
 		uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
 	else
-- 
cgit v1.2.3-55-g7522


From 39447b386c846bbf1c56f6403c5282837486200f Mon Sep 17 00:00:00 2001
From: Zhang, Yanmin
Date: Mon, 19 Apr 2010 13:32:41 +0800
Subject: perf: Enhance perf to allow for guest statistic collection from host

Below patch introduces perf_guest_info_callbacks and related
register/unregister functions. Add more PERF_RECORD_MISC_XXX bits
meaning guest kernel and guest user space.

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/perf_event.h | 15 ++++-----------
 arch/x86/kernel/cpu/perf_event.c  | 31 +++++++++++++++++++++++++++++++
 include/linux/perf_event.h        | 21 ++++++++++++++++++++-
 kernel/perf_event.c               | 23 ++++++++++++++++++++++-
 4 files changed, 77 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index f6d43dbfd8e7..254883d0c7e0 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -135,17 +135,10 @@ extern void perf_events_lapic_init(void);
  */
 #define PERF_EFLAGS_EXACT	(1UL << 3)
 
-#define perf_misc_flags(regs)				\
-({	int misc = 0;					\
-	if (user_mode(regs))				\
-		misc |= PERF_RECORD_MISC_USER;		\
-	else						\
-		misc |= PERF_RECORD_MISC_KERNEL;	\
-	if (regs->flags & PERF_EFLAGS_EXACT)		\
-		misc |= PERF_RECORD_MISC_EXACT;		\
-	misc; })
-
-#define perf_instruction_pointer(regs)	((regs)->ip)
+struct pt_regs;
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs)	perf_misc_flags(regs)
 
 #else
 static inline void init_hw_perf_events(void)		{ }
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 626154a9f535..2ea78abf69d9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1720,6 +1720,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 {
 	struct perf_callchain_entry *entry;
 
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+		/* TODO: We don't support guest os callchain now */
+		return NULL;
+	}
+
 	if (in_nmi())
 		entry = &__get_cpu_var(pmc_nmi_entry);
 	else
@@ -1743,3 +1748,29 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 	regs->cs = __KERNEL_CS;
 	local_save_flags(regs->flags);
 }
+
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+	unsigned long ip;
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+		ip = perf_guest_cbs->get_guest_ip();
+	else
+		ip = instruction_pointer(regs);
+	return ip;
+}
+
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+	int misc = 0;
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+		misc |= perf_guest_cbs->is_user_mode() ?
+			PERF_RECORD_MISC_GUEST_USER :
+			PERF_RECORD_MISC_GUEST_KERNEL;
+	} else
+		misc |= user_mode(regs) ? PERF_RECORD_MISC_USER :
+			PERF_RECORD_MISC_KERNEL;
+	if (regs->flags & PERF_EFLAGS_EXACT)
+		misc |= PERF_RECORD_MISC_EXACT;
+
+	return misc;
+}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index bf896d0b2e9c..24de5f181a41 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -288,11 +288,13 @@ struct perf_event_mmap_page {
 	__u64	data_tail;		/* user-space written tail */
 };
 
-#define PERF_RECORD_MISC_CPUMODE_MASK		(3 << 0)
+#define PERF_RECORD_MISC_CPUMODE_MASK		(7 << 0)
 #define PERF_RECORD_MISC_CPUMODE_UNKNOWN	(0 << 0)
 #define PERF_RECORD_MISC_KERNEL			(1 << 0)
 #define PERF_RECORD_MISC_USER			(2 << 0)
 #define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
+#define PERF_RECORD_MISC_GUEST_KERNEL		(4 << 0)
+#define PERF_RECORD_MISC_GUEST_USER		(5 << 0)
 
 #define PERF_RECORD_MISC_EXACT			(1 << 14)
 /*
@@ -446,6 +448,12 @@ enum perf_callchain_context {
 # include <asm/perf_event.h>
 #endif
 
+struct perf_guest_info_callbacks {
+	int (*is_in_guest) (void);
+	int (*is_user_mode) (void);
+	unsigned long (*get_guest_ip) (void);
+};
+
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 #include <asm/hw_breakpoint.h>
 #endif
@@ -932,6 +940,12 @@ static inline void perf_event_mmap(struct vm_area_struct *vma)
 		__perf_event_mmap(vma);
 }
 
+extern struct perf_guest_info_callbacks *perf_guest_cbs;
+extern int perf_register_guest_info_callbacks(
+		struct perf_guest_info_callbacks *);
+extern int perf_unregister_guest_info_callbacks(
+		struct perf_guest_info_callbacks *);
+
 extern void perf_event_comm(struct task_struct *tsk);
 extern void perf_event_fork(struct task_struct *tsk);
 
@@ -1001,6 +1015,11 @@ perf_sw_event(u32 event_id, u64 nr, int nmi,
 static inline void
 perf_bp_event(struct perf_event *event, void *data)			{ }
 
+static inline int perf_register_guest_info_callbacks
+(struct perf_guest_info_callbacks *) {return 0; }
+static inline int perf_unregister_guest_info_callbacks
+(struct perf_guest_info_callbacks *) {return 0; }
+
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
 static inline void perf_event_fork(struct task_struct *tsk)		{ }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 07b7a435bf03..9dbe8cdaf145 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2797,6 +2797,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 }
 
 
+/*
+ * We assume there is only KVM supporting the callbacks.
+ * Later on, we might change it to a list if there is
+ * another virtualization implementation supporting the callbacks.
+ */
+struct perf_guest_info_callbacks *perf_guest_cbs;
+
+int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+	perf_guest_cbs = cbs;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
+
+int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+	perf_guest_cbs = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+
 /*
  * Output
  */
@@ -3749,7 +3770,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
 		.event_id  = {
 			.header = {
 				.type = PERF_RECORD_MMAP,
-				.misc = 0,
+				.misc = PERF_RECORD_MISC_USER,
 				/* .size */
 			},
 			/* .pid */
-- 
cgit v1.2.3-55-g7522


From ff9d07a0e7ce756a183e7c2e483aec452ee6b574 Mon Sep 17 00:00:00 2001
From: Zhang, Yanmin
Date: Mon, 19 Apr 2010 13:32:45 +0800
Subject: KVM: Implement perf callbacks for guest sampling

Below patch implements the perf_guest_info_callbacks on kvm.

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c |  5 ++++-
 arch/x86/kvm/x86.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.h |  3 +++
 3 files changed, 53 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 686492ed3079..82be6dac3d25 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3654,8 +3654,11 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 
 	/* We need to handle NMIs before interrupts are enabled */
 	if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
-	    (exit_intr_info & INTR_INFO_VALID_MASK))
+	    (exit_intr_info & INTR_INFO_VALID_MASK)) {
+		kvm_before_handle_nmi(&vmx->vcpu);
 		asm("int $2");
+		kvm_after_handle_nmi(&vmx->vcpu);
+	}
 
 	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 24cd0ee896e9..c3a33b2bb169 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -40,6 +40,7 @@
 #include <linux/user-return-notifier.h>
 #include <linux/srcu.h>
 #include <linux/slab.h>
+#include <linux/perf_event.h>
 #include <trace/events/kvm.h>
 #undef TRACE_INCLUDE_FILE
 #define CREATE_TRACE_POINTS
@@ -3765,6 +3766,47 @@ static void kvm_timer_init(void)
 	}
 }
 
+static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+
+static int kvm_is_in_guest(void)
+{
+	return percpu_read(current_vcpu) != NULL;
+}
+
+static int kvm_is_user_mode(void)
+{
+	int user_mode = 3;
+	if (percpu_read(current_vcpu))
+		user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+	return user_mode != 0;
+}
+
+static unsigned long kvm_get_guest_ip(void)
+{
+	unsigned long ip = 0;
+	if (percpu_read(current_vcpu))
+		ip = kvm_rip_read(percpu_read(current_vcpu));
+	return ip;
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+	.is_in_guest		= kvm_is_in_guest,
+	.is_user_mode		= kvm_is_user_mode,
+	.get_guest_ip		= kvm_get_guest_ip,
+};
+
+void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
+{
+	percpu_write(current_vcpu, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
+
+void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
+{
+	percpu_write(current_vcpu, NULL);
+}
+EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
@@ -3801,6 +3843,8 @@ int kvm_arch_init(void *opaque)
 
 	kvm_timer_init();
 
+	perf_register_guest_info_callbacks(&kvm_guest_cbs);
+
 	return 0;
 
 out:
@@ -3809,6 +3853,8 @@ out:
 
 void kvm_arch_exit(void)
 {
+	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
 		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
 					    CPUFREQ_TRANSITION_NOTIFIER);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2d101639bd8d..b7a404722d2b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,4 +65,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
 }
 
+void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
+void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+
 #endif
-- 
cgit v1.2.3-55-g7522


From dcf46b9443ad48a227a61713adea001228925adf Mon Sep 17 00:00:00 2001
From: Zhang, Yanmin
Date: Tue, 20 Apr 2010 10:13:58 +0800
Subject: perf & kvm: Clean up some of the guest profiling callback API details

Fix some build bug and programming style issues:

 - use valid C
 - fix up various style details

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Sheng Yang <sheng@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: oerg Roedel <joro@8bytes.org>
Cc: Jes Sorensen <Jes.Sorensen@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Zachary Amsden <zamsden@redhat.com>
Cc: zhiteng.huang@intel.com
Cc: tim.c.chen@intel.com
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
LKML-Reference: <1271729638.2078.624.camel@ymzhang.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 20 ++++++++++++++------
 arch/x86/kvm/x86.c               |  4 ++++
 include/linux/perf_event.h       | 10 ++++------
 3 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2ea78abf69d9..7de70613e6c3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1752,23 +1752,31 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
 	unsigned long ip;
+
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
 		ip = perf_guest_cbs->get_guest_ip();
 	else
 		ip = instruction_pointer(regs);
+
 	return ip;
 }
 
 unsigned long perf_misc_flags(struct pt_regs *regs)
 {
 	int misc = 0;
+
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-		misc |= perf_guest_cbs->is_user_mode() ?
-			PERF_RECORD_MISC_GUEST_USER :
-			PERF_RECORD_MISC_GUEST_KERNEL;
-	} else
-		misc |= user_mode(regs) ? PERF_RECORD_MISC_USER :
-			PERF_RECORD_MISC_KERNEL;
+		if (perf_guest_cbs->is_user_mode())
+			misc |= PERF_RECORD_MISC_GUEST_USER;
+		else
+			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+	} else {
+		if (user_mode(regs))
+			misc |= PERF_RECORD_MISC_USER;
+		else
+			misc |= PERF_RECORD_MISC_KERNEL;
+	}
+
 	if (regs->flags & PERF_EFLAGS_EXACT)
 		misc |= PERF_RECORD_MISC_EXACT;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c3a33b2bb169..21b9b6aa3e88 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3776,16 +3776,20 @@ static int kvm_is_in_guest(void)
 static int kvm_is_user_mode(void)
 {
 	int user_mode = 3;
+
 	if (percpu_read(current_vcpu))
 		user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+
 	return user_mode != 0;
 }
 
 static unsigned long kvm_get_guest_ip(void)
 {
 	unsigned long ip = 0;
+
 	if (percpu_read(current_vcpu))
 		ip = kvm_rip_read(percpu_read(current_vcpu));
+
 	return ip;
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24de5f181a41..ace31fbac513 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -941,10 +941,8 @@ static inline void perf_event_mmap(struct vm_area_struct *vma)
 }
 
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
-extern int perf_register_guest_info_callbacks(
-		struct perf_guest_info_callbacks *);
-extern int perf_unregister_guest_info_callbacks(
-		struct perf_guest_info_callbacks *);
+extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
+extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 
 extern void perf_event_comm(struct task_struct *tsk);
 extern void perf_event_fork(struct task_struct *tsk);
@@ -1016,9 +1014,9 @@ static inline void
 perf_bp_event(struct perf_event *event, void *data)			{ }
 
 static inline int perf_register_guest_info_callbacks
-(struct perf_guest_info_callbacks *) {return 0; }
+(struct perf_guest_info_callbacks *callbacks) { return 0; }
 static inline int perf_unregister_guest_info_callbacks
-(struct perf_guest_info_callbacks *) {return 0; }
+(struct perf_guest_info_callbacks *callbacks) { return 0; }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
-- 
cgit v1.2.3-55-g7522


From 40f0a5d0a16e68a68ab3d230f1ddd96c81cf5340 Mon Sep 17 00:00:00 2001
From: Justin P. Mattock
Date: Mon, 19 Apr 2010 11:51:16 -0700
Subject: Fix comment typo in percpu.h

Fix a typo in arch/x86/include/asm/percpu.h

Signed-off-by: Justin P. Mattock <justinmattock@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/include/asm/percpu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 66a272dfd8b8..9899afa0283e 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -105,7 +105,7 @@ do {							\
 
 /*
  * Generate a percpu add to memory instruction and optimize code
- * if a one is added or subtracted.
+ * if one is added or subtracted.
  */
 #define percpu_add_op(var, val)						\
 do {									\
-- 
cgit v1.2.3-55-g7522


From aa2110cb1a7510f9b834adfb39b05d4843a35d35 Mon Sep 17 00:00:00 2001
From: Lin Ming
Date: Thu, 8 Apr 2010 14:34:27 +0800
Subject: ACPI: add boot option acpi=copy_dsdt to fix corrupt DSDT

Some BIOS on Toshiba machines corrupt the DSDT, so add a new
boot option acpi=copy_dsdt to workaround it.
Add warning message to ask users to use this option if corrupt DSDT detected.

Also build a DMI blacklist to check it and automatically copy DSDT.

https://bugzilla.kernel.org/show_bug.cgi?id=14679

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 Documentation/kernel-parameters.txt |  1 +
 arch/x86/kernel/acpi/boot.c         |  4 ++++
 drivers/acpi/acpica/tbutils.c       |  4 ++++
 drivers/acpi/bus.c                  | 37 +++++++++++++++++++++++++++++++++++++
 4 files changed, 46 insertions(+)

(limited to 'arch/x86')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e2202e93b148..93f113a1b326 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -151,6 +151,7 @@ and is between 256 and 4096 characters. It is defined in the file
 			strict -- Be less tolerant of platforms that are not
 				strictly ACPI specification compliant.
 			rsdt -- prefer RSDT over (default) XSDT
+			copy_dsdt -- copy DSDT to memory
 
 			See also Documentation/power/pm.txt, pci=noacpi
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cd40aba6aa95..b9629384263c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1576,6 +1576,10 @@ static int __init parse_acpi(char *arg)
 	/* "acpi=noirq" disables ACPI interrupt routing */
 	else if (strcmp(arg, "noirq") == 0) {
 		acpi_noirq_set();
+	}
+	/* "acpi=copy_dsdt" copys DSDT */
+	else if (strcmp(arg, "copy_dsdt") == 0) {
+		acpi_gbl_copy_dsdt_locally = 1;
 	} else {
 		/* Core will printk when we return error. */
 		return -EINVAL;
diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c
index 54a8712bae62..a9b105fc2e55 100644
--- a/drivers/acpi/acpica/tbutils.c
+++ b/drivers/acpi/acpica/tbutils.c
@@ -373,6 +373,10 @@ void acpi_tb_check_dsdt_header(void)
 		acpi_tb_print_table_header(0, &acpi_gbl_original_dsdt_header);
 		acpi_tb_print_table_header(0, acpi_gbl_DSDT);
 
+		ACPI_ERROR((AE_INFO,
+			    "Please send DMI info to linux-acpi@vger.kernel.org\n"
+			    "If system does not work as expected, please boot with acpi=copy_dsdt"));
+
 		/* Disable further error messages */
 
 		acpi_gbl_original_dsdt_header.length = acpi_gbl_DSDT->length;
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 37132dc2da03..49af19bb8c9b 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -69,6 +69,37 @@ static struct dmi_system_id __cpuinitdata power_nocheck_dmi_table[] = {
 };
 
 
+static int set_copy_dsdt(const struct dmi_system_id *id)
+{
+	printk(KERN_NOTICE "%s detected - "
+		"force copy of DSDT to local memory\n", id->ident);
+	acpi_gbl_copy_dsdt_locally = 1;
+	return 0;
+}
+
+static struct dmi_system_id dsdt_dmi_table[] __initdata = {
+	/*
+	 * Insyde BIOS on some TOSHIBA machines corrupt the DSDT.
+	 * https://bugzilla.kernel.org/show_bug.cgi?id=14679
+	 */
+	{
+	 .callback = set_copy_dsdt,
+	 .ident = "TOSHIBA Satellite A505",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "Satellite A505"),
+		},
+	},
+	{
+	 .callback = set_copy_dsdt,
+	 .ident = "TOSHIBA Satellite L505D",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "Satellite L505D"),
+		},
+	}
+};
+
 /* --------------------------------------------------------------------------
                                 Device Management
    -------------------------------------------------------------------------- */
@@ -813,6 +844,12 @@ void __init acpi_early_init(void)
 
 	acpi_gbl_permanent_mmap = 1;
 
+	/*
+	 * If the machine falls into the DMI check table,
+	 * DSDT will be copied to memory
+	 */
+	dmi_check_system(dsdt_dmi_table);
+
 	status = acpi_reallocate_root_table();
 	if (ACPI_FAILURE(status)) {
 		printk(KERN_ERR PREFIX
-- 
cgit v1.2.3-55-g7522


From b1ab1b4d9ab9812c77843abec79030292ef0a544 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Thu, 22 Apr 2010 16:06:58 +0200
Subject: x86, cacheinfo: Unify AMD L3 cache index disable checking

All F10h CPUs starting with model 8 resp. 9, stepping 1, support L3
cache index disable. Concentrate the family, model, stepping checking at
one place and enable the feature implicitly on upcoming Fam10h models.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-2-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b3eeb66c0a51..acfb08383908 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -328,18 +328,22 @@ static unsigned int __cpuinit amd_calc_l3_indices(void)
 static void __cpuinit
 amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
-	if (index < 3)
+	if (boot_cpu_data.x86 != 0x10)
 		return;
 
-	if (boot_cpu_data.x86 == 0x11)
+	if (index < 3)
 		return;
 
 	/* see errata #382 and #388 */
-	if ((boot_cpu_data.x86 == 0x10) &&
-	    ((boot_cpu_data.x86_model < 0x8) ||
-	     (boot_cpu_data.x86_mask  < 0x1)))
+	if (boot_cpu_data.x86_model < 0x8)
 		return;
 
+	if ((boot_cpu_data.x86_model == 0x8 ||
+	     boot_cpu_data.x86_model == 0x9)
+		&&
+	     boot_cpu_data.x86_mask < 0x1)
+			return;
+
 	this_leaf->can_disable = true;
 	this_leaf->l3_indices  = amd_calc_l3_indices();
 }
@@ -443,8 +447,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 		amd_cpuid4(index, &eax, &ebx, &ecx);
-		if (boot_cpu_data.x86 >= 0x10)
-			amd_check_l3_disable(index, this_leaf);
+		amd_check_l3_disable(index, this_leaf);
 	} else {
 		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
 	}
-- 
cgit v1.2.3-55-g7522


From f2b20e41407fccfcfacf927ff91ec888832a37af Mon Sep 17 00:00:00 2001
From: Frank Arnold
Date: Thu, 22 Apr 2010 16:06:59 +0200
Subject: x86, cacheinfo: Turn off L3 cache index disable feature in
 virtualized environments

When running a quest kernel on xen we get:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000038
IP: [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
PGD 0
Oops: 0000 [#1] SMP
last sysfs file:
CPU 0
Modules linked in:

Pid: 0, comm: swapper Tainted: G        W  2.6.34-rc3 #1 /HVM domU
RIP: 0010:[<ffffffff8142f2fb>]  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x
2ca/0x3df
RSP: 0018:ffff880002203e08  EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000060
RDX: 0000000000000000 RSI: 0000000000000040 RDI: 0000000000000000
RBP: ffff880002203ed8 R08: 00000000000017c0 R09: ffff880002203e38
R10: ffff8800023d5d40 R11: ffffffff81a01e28 R12: ffff880187e6f5c0
R13: ffff880002203e34 R14: ffff880002203e58 R15: ffff880002203e68
FS:  0000000000000000(0000) GS:ffff880002200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000038 CR3: 0000000001a3c000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffffffff81a00000, task ffffffff81a44020)
Stack:
 ffffffff810d7ecb ffff880002203e20 ffffffff81059140 ffff880002203e30
<0> ffffffff810d7ec9 0000000002203e40 000000000050d140 ffff880002203e70
<0> 0000000002008140 0000000000000086 ffff880040020140 ffffffff81068b8b
Call Trace:
 <IRQ>
 [<ffffffff810d7ecb>] ? sync_supers_timer_fn+0x0/0x1c
 [<ffffffff81059140>] ? mod_timer+0x23/0x25
 [<ffffffff810d7ec9>] ? arm_supers_timer+0x34/0x36
 [<ffffffff81068b8b>] ? hrtimer_get_next_event+0xa7/0xc3
 [<ffffffff81058e85>] ? get_next_timer_interrupt+0x19a/0x20d
 [<ffffffff8142fa23>] get_cpu_leaves+0x5c/0x232
 [<ffffffff8106a7b1>] ? sched_clock_local+0x1c/0x82
 [<ffffffff8106a9a0>] ? sched_clock_tick+0x75/0x7a
 [<ffffffff8107748c>] generic_smp_call_function_single_interrupt+0xae/0xd0
 [<ffffffff8101f6ef>] smp_call_function_single_interrupt+0x18/0x27
 [<ffffffff8100a773>] call_function_single_interrupt+0x13/0x20
 <EOI>
 [<ffffffff8143c468>] ? notifier_call_chain+0x14/0x63
 [<ffffffff810295c6>] ? native_safe_halt+0xc/0xd
 [<ffffffff810114eb>] ? default_idle+0x36/0x53
 [<ffffffff81008c22>] cpu_idle+0xaa/0xe4
 [<ffffffff81423a9a>] rest_init+0x7e/0x80
 [<ffffffff81b10dd2>] start_kernel+0x40e/0x419
 [<ffffffff81b102c8>] x86_64_start_reservations+0xb3/0xb7
 [<ffffffff81b103c4>] x86_64_start_kernel+0xf8/0x107
Code: 14 d5 40 ff ae 81 8b 14 02 31 c0 3b 15 47 1c 8b 00 7d 0e 48 8b 05 36 1c 8b
 00 48 63 d2 48 8b 04 d0 c7 85 5c ff ff ff 00 00 00 00 <8b> 70 38 48 8d 8d 5c ff
 ff ff 48 8b 78 10 ba c4 01 00 00 e8 eb
RIP  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
 RSP <ffff880002203e08>
CR2: 0000000000000038
---[ end trace a7919e7f17c0a726 ]---

The L3 cache index disable feature of AMD CPUs has to be disabled if the
kernel is running as guest on top of a hypervisor because northbridge
devices are not available to the guest. Currently, this fixes a boot
crash on top of Xen. In the future this will become an issue on KVM as
well.

Check if northbridge devices are present and do not enable the feature
if there are none.

Signed-off-by: Frank Arnold <frank.arnold@amd.com>
LKML-Reference: <1271945222-5283-3-git-send-email-bp@amd64.org>
Acked-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index acfb08383908..5ab14c86c6ec 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -344,6 +344,10 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	     boot_cpu_data.x86_mask < 0x1)
 			return;
 
+	/* not in virtualized environments */
+	if (num_k8_northbridges == 0)
+		return;
+
 	this_leaf->can_disable = true;
 	this_leaf->l3_indices  = amd_calc_l3_indices();
 }
-- 
cgit v1.2.3-55-g7522


From 9350f982e4fe539e83a2d4a13e9b53ad8253c4a8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Thu, 22 Apr 2010 16:07:00 +0200
Subject: x86, cacheinfo: Reorganize AMD L3 cache structure

Add a struct representing L3 cache attributes (subcache sizes and
indices count) and move the respective members out of _cpuid4_info.
Also, stash the struct pci_dev ptr into the struct simplifying the code
even more.

There should be no functionality change resulting from this patch except
slightly slimming the _cpuid4_info per-cpu vars.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-4-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 53 +++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 5ab14c86c6ec..ff663ca63fdc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx {
 	u32 full;
 };
 
+struct amd_l3_cache {
+	struct	 pci_dev *dev;
+	bool	 can_disable;
+	unsigned indices;
+	u8	 subcaches[4];
+};
+
 struct _cpuid4_info {
 	union _cpuid4_leaf_eax eax;
 	union _cpuid4_leaf_ebx ebx;
 	union _cpuid4_leaf_ecx ecx;
 	unsigned long size;
-	bool can_disable;
-	unsigned int l3_indices;
+	struct amd_l3_cache *l3;
 	DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
 };
 
@@ -164,8 +170,7 @@ struct _cpuid4_info_regs {
 	union _cpuid4_leaf_ebx ebx;
 	union _cpuid4_leaf_ecx ecx;
 	unsigned long size;
-	bool can_disable;
-	unsigned int l3_indices;
+	struct amd_l3_cache *l3;
 };
 
 unsigned short			num_cache_leaves;
@@ -302,7 +307,7 @@ struct _cache_attr {
 };
 
 #ifdef CONFIG_CPU_SUP_AMD
-static unsigned int __cpuinit amd_calc_l3_indices(void)
+static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 {
 	/*
 	 * We're called over smp_call_function_single() and therefore
@@ -317,12 +322,14 @@ static unsigned int __cpuinit amd_calc_l3_indices(void)
 	pci_read_config_dword(dev, 0x1C4, &val);
 
 	/* calculate subcache sizes */
-	sc0 = !(val & BIT(0));
-	sc1 = !(val & BIT(4));
-	sc2 = !(val & BIT(8))  + !(val & BIT(9));
-	sc3 = !(val & BIT(12)) + !(val & BIT(13));
+	l3->subcaches[0] = sc0 = !(val & BIT(0));
+	l3->subcaches[1] = sc1 = !(val & BIT(4));
+	l3->subcaches[2] = sc2 = !(val & BIT(8))  + !(val & BIT(9));
+	l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+	l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
 
-	return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+	l3->dev = dev;
 }
 
 static void __cpuinit
@@ -348,19 +355,23 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	if (num_k8_northbridges == 0)
 		return;
 
-	this_leaf->can_disable = true;
-	this_leaf->l3_indices  = amd_calc_l3_indices();
+	this_leaf->l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
+	if (!this_leaf->l3) {
+		printk(KERN_WARNING "Error allocating L3 struct\n");
+		return;
+	}
+
+	this_leaf->l3->can_disable = true;
+	amd_calc_l3_indices(this_leaf->l3);
 }
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 				  unsigned int index)
 {
-	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-	int node = amd_get_nb_id(cpu);
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
+	struct pci_dev *dev = this_leaf->l3->dev;
 	unsigned int reg = 0;
 
-	if (!this_leaf->can_disable)
+	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
 		return -EINVAL;
 
 	if (!dev)
@@ -382,15 +393,14 @@ SHOW_CACHE_DISABLE(1)
 static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 	const char *buf, size_t count, unsigned int index)
 {
+	struct pci_dev *dev = this_leaf->l3->dev;
 	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-	int node = amd_get_nb_id(cpu);
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
 	unsigned long val = 0;
 
 #define SUBCACHE_MASK	(3UL << 20)
 #define SUBCACHE_INDEX	0xfff
 
-	if (!this_leaf->can_disable)
+	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
 		return -EINVAL;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -404,7 +414,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 
 	/* do not allow writes outside of allowed bits */
 	if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
-	    ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
+	    ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
 		return -EINVAL;
 
 	val |= BIT(30);
@@ -708,6 +718,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
 	for (i = 0; i < num_cache_leaves; i++)
 		cache_remove_shared_cpu_map(cpu, i);
 
+	kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
 	kfree(per_cpu(ici_cpuid4_info, cpu));
 	per_cpu(ici_cpuid4_info, cpu) = NULL;
 }
@@ -992,7 +1003,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 
 		this_leaf = CPUID4_INFO_IDX(cpu, i);
 
-		if (this_leaf->can_disable)
+		if (this_leaf->l3 && this_leaf->l3->can_disable)
 			ktype_cache.default_attrs = default_l3_attrs;
 		else
 			ktype_cache.default_attrs = default_attrs;
-- 
cgit v1.2.3-55-g7522


From ba06edb63f5ef2913aad37070eaec3c9d8ac73b8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Thu, 22 Apr 2010 16:07:01 +0200
Subject: x86, cacheinfo: Make L3 cache info per node

Currently, we're allocating L3 cache info and calculating indices for
each online cpu which is clearly superfluous. Instead, we need to do
this per-node as is each L3 cache.

No functional change, only per-cpu memory savings.

-v2: Allocate L3 cache descriptors array dynamically.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-5-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 59 ++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index ff663ca63fdc..1346e9c23fc4 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -307,19 +307,18 @@ struct _cache_attr {
 };
 
 #ifdef CONFIG_CPU_SUP_AMD
+
+/*
+ * L3 cache descriptors
+ */
+static struct amd_l3_cache **__cpuinitdata l3_caches;
+
 static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 {
-	/*
-	 * We're called over smp_call_function_single() and therefore
-	 * are on the correct cpu.
-	 */
-	int cpu = smp_processor_id();
-	int node = cpu_to_node(cpu);
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
 	unsigned int sc0, sc1, sc2, sc3;
 	u32 val = 0;
 
-	pci_read_config_dword(dev, 0x1C4, &val);
+	pci_read_config_dword(l3->dev, 0x1C4, &val);
 
 	/* calculate subcache sizes */
 	l3->subcaches[0] = sc0 = !(val & BIT(0));
@@ -328,13 +327,31 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 	l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
 
 	l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+}
+
+static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
+{
+	struct amd_l3_cache *l3;
+	struct pci_dev *dev = node_to_k8_nb_misc(node);
+
+	l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
+	if (!l3) {
+		printk(KERN_WARNING "Error allocating L3 struct\n");
+		return NULL;
+	}
 
 	l3->dev = dev;
+
+	amd_calc_l3_indices(l3);
+
+	return l3;
 }
 
 static void __cpuinit
 amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
+	int node;
+
 	if (boot_cpu_data.x86 != 0x10)
 		return;
 
@@ -355,14 +372,28 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	if (num_k8_northbridges == 0)
 		return;
 
-	this_leaf->l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
-	if (!this_leaf->l3) {
-		printk(KERN_WARNING "Error allocating L3 struct\n");
-		return;
+	/*
+	 * Strictly speaking, the amount in @size below is leaked since it is
+	 * never freed but this is done only on shutdown so it doesn't matter.
+	 */
+	if (!l3_caches) {
+		int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
+
+		l3_caches = kzalloc(size, GFP_ATOMIC);
+		if (!l3_caches)
+			return;
 	}
 
-	this_leaf->l3->can_disable = true;
-	amd_calc_l3_indices(this_leaf->l3);
+	node = amd_get_nb_id(smp_processor_id());
+
+	if (!l3_caches[node]) {
+		l3_caches[node] = amd_init_l3_cache(node);
+		l3_caches[node]->can_disable = true;
+	}
+
+	WARN_ON(!l3_caches[node]);
+
+	this_leaf->l3 = l3_caches[node];
 }
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
-- 
cgit v1.2.3-55-g7522


From 59d3b388741cf1a5eb7ad27fd4e9ed72643164ae Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Thu, 22 Apr 2010 16:07:02 +0200
Subject: x86, cacheinfo: Disable index in all four subcaches

When disabling an L3 cache index, make sure we disable that index in
all four subcaches of the L3. Clarify nomenclature while at it, wrt to
disable slots versus disable index and rename accordingly.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-6-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 60 +++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1346e9c23fc4..33eae2062cf5 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -397,7 +397,7 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 }
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
-				  unsigned int index)
+				  unsigned int slot)
 {
 	struct pci_dev *dev = this_leaf->l3->dev;
 	unsigned int reg = 0;
@@ -408,21 +408,53 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 	if (!dev)
 		return -EINVAL;
 
-	pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+	pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
 	return sprintf(buf, "0x%08x\n", reg);
 }
 
-#define SHOW_CACHE_DISABLE(index)					\
+#define SHOW_CACHE_DISABLE(slot)					\
 static ssize_t								\
-show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf)	\
+show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf)	\
 {									\
-	return show_cache_disable(this_leaf, buf, index);		\
+	return show_cache_disable(this_leaf, buf, slot);		\
 }
 SHOW_CACHE_DISABLE(0)
 SHOW_CACHE_DISABLE(1)
 
+static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
+				 unsigned slot, unsigned long idx)
+{
+	int i;
+
+	idx |= BIT(30);
+
+	/*
+	 *  disable index in all 4 subcaches
+	 */
+	for (i = 0; i < 4; i++) {
+		u32 reg = idx | (i << 20);
+
+		if (!l3->subcaches[i])
+			continue;
+
+		pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+
+		/*
+		 * We need to WBINVD on a core on the node containing the L3
+		 * cache which indices we disable therefore a simple wbinvd()
+		 * is not sufficient.
+		 */
+		wbinvd_on_cpu(cpu);
+
+		reg |= BIT(31);
+		pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+	}
+}
+
+
 static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
-	const char *buf, size_t count, unsigned int index)
+				   const char *buf, size_t count,
+				   unsigned int slot)
 {
 	struct pci_dev *dev = this_leaf->l3->dev;
 	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -448,23 +480,17 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 	    ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
 		return -EINVAL;
 
-	val |= BIT(30);
-	pci_write_config_dword(dev, 0x1BC + index * 4, val);
-	/*
-	 * We need to WBINVD on a core on the node containing the L3 cache which
-	 * indices we disable therefore a simple wbinvd() is not sufficient.
-	 */
-	wbinvd_on_cpu(cpu);
-	pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
+	amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
+
 	return count;
 }
 
-#define STORE_CACHE_DISABLE(index)					\
+#define STORE_CACHE_DISABLE(slot)					\
 static ssize_t								\
-store_cache_disable_##index(struct _cpuid4_info *this_leaf,		\
+store_cache_disable_##slot(struct _cpuid4_info *this_leaf,		\
 			    const char *buf, size_t count)		\
 {									\
-	return store_cache_disable(this_leaf, buf, count, index);	\
+	return store_cache_disable(this_leaf, buf, count, slot);	\
 }
 STORE_CACHE_DISABLE(0)
 STORE_CACHE_DISABLE(1)
-- 
cgit v1.2.3-55-g7522


From 1f9cc3cb6a27521edfe0a21abf97d2bb11c4d237 Mon Sep 17 00:00:00 2001
From: Robin Holt
Date: Fri, 23 Apr 2010 10:36:22 -0500
Subject: x86, pat: Update the page flags for memtype atomically instead of
 using memtype_lock

While testing an application using the xpmem (out of kernel) driver, we
noticed a significant page fault rate reduction of x86_64 with respect
to ia64.  For one test running with 32 cpus, one thread per cpu, it
took 01:08 for each of the threads to vm_insert_pfn 2GB worth of pages.
For the same test running on 256 cpus, one thread per cpu, it took 14:48
to vm_insert_pfn 2 GB worth of pages.

The slowdown was tracked to lookup_memtype which acquires the
spinlock memtype_lock.  This heavily contended lock was slowing down
vm_insert_pfn().

With the cmpxchg on page->flags method, both the 32 cpu and 256 cpu
cases take approx 00:01.3 seconds to complete.

Signed-off-by: Robin Holt <holt@sgi.com>
LKML-Reference: <20100423153627.751194346@gulag1.americas.sgi.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@gmail.com>
Cc: Rafael Wysocki <rjw@novell.com>
Reviewed-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/cacheflush.h | 44 ++++++++++++++++++++++-----------------
 arch/x86/mm/pat.c                 |  8 -------
 2 files changed, 25 insertions(+), 27 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 634c40a739a6..c70068d05f70 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -44,9 +44,6 @@ static inline void copy_from_user_page(struct vm_area_struct *vma,
 	memcpy(dst, src, len);
 }
 
-#define PG_WC				PG_arch_1
-PAGEFLAG(WC, WC)
-
 #ifdef CONFIG_X86_PAT
 /*
  * X86 PAT uses page flags WC and Uncached together to keep track of
@@ -55,16 +52,24 @@ PAGEFLAG(WC, WC)
  * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not
  * been changed from its default (value of -1 used to denote this).
  * Note we do not support _PAGE_CACHE_UC here.
- *
- * Caller must hold memtype_lock for atomicity.
  */
+
+#define _PGMT_DEFAULT		0
+#define _PGMT_WC		(1UL << PG_arch_1)
+#define _PGMT_UC_MINUS		(1UL << PG_uncached)
+#define _PGMT_WB		(1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_CLEAR_MASK	(~_PGMT_MASK)
+
 static inline unsigned long get_page_memtype(struct page *pg)
 {
-	if (!PageUncached(pg) && !PageWC(pg))
+	unsigned long pg_flags = pg->flags & _PGMT_MASK;
+
+	if (pg_flags == _PGMT_DEFAULT)
 		return -1;
-	else if (!PageUncached(pg) && PageWC(pg))
+	else if (pg_flags == _PGMT_WC)
 		return _PAGE_CACHE_WC;
-	else if (PageUncached(pg) && !PageWC(pg))
+	else if (pg_flags == _PGMT_UC_MINUS)
 		return _PAGE_CACHE_UC_MINUS;
 	else
 		return _PAGE_CACHE_WB;
@@ -72,25 +77,26 @@ static inline unsigned long get_page_memtype(struct page *pg)
 
 static inline void set_page_memtype(struct page *pg, unsigned long memtype)
 {
+	unsigned long memtype_flags = _PGMT_DEFAULT;
+	unsigned long old_flags;
+	unsigned long new_flags;
+
 	switch (memtype) {
 	case _PAGE_CACHE_WC:
-		ClearPageUncached(pg);
-		SetPageWC(pg);
+		memtype_flags = _PGMT_WC;
 		break;
 	case _PAGE_CACHE_UC_MINUS:
-		SetPageUncached(pg);
-		ClearPageWC(pg);
+		memtype_flags = _PGMT_UC_MINUS;
 		break;
 	case _PAGE_CACHE_WB:
-		SetPageUncached(pg);
-		SetPageWC(pg);
-		break;
-	default:
-	case -1:
-		ClearPageUncached(pg);
-		ClearPageWC(pg);
+		memtype_flags = _PGMT_WB;
 		break;
 	}
+
+	do {
+		old_flags = pg->flags;
+		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
+	} while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
 }
 #else
 static inline unsigned long get_page_memtype(struct page *pg) { return -1; }
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 951011166ef5..501fc60e5e4d 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -190,8 +190,6 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
  * Here we do two pass:
  * - Find the memtype of all the pages in the range, look for any conflicts
  * - In case of no conflicts, set the new memtype for pages in the range
- *
- * Caller must hold memtype_lock for atomicity.
  */
 static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
 				  unsigned long *new_type)
@@ -297,9 +295,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	is_range_ram = pat_pagerange_is_ram(start, end);
 	if (is_range_ram == 1) {
 
-		spin_lock(&memtype_lock);
 		err = reserve_ram_pages_type(start, end, req_type, new_type);
-		spin_unlock(&memtype_lock);
 
 		return err;
 	} else if (is_range_ram < 0) {
@@ -351,9 +347,7 @@ int free_memtype(u64 start, u64 end)
 	is_range_ram = pat_pagerange_is_ram(start, end);
 	if (is_range_ram == 1) {
 
-		spin_lock(&memtype_lock);
 		err = free_ram_pages_type(start, end);
-		spin_unlock(&memtype_lock);
 
 		return err;
 	} else if (is_range_ram < 0) {
@@ -394,10 +388,8 @@ static unsigned long lookup_memtype(u64 paddr)
 
 	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
 		struct page *page;
-		spin_lock(&memtype_lock);
 		page = pfn_to_page(paddr >> PAGE_SHIFT);
 		rettype = get_page_memtype(page);
-		spin_unlock(&memtype_lock);
 		/*
 		 * -1 from get_page_memtype() implies RAM page is in its
 		 * default state and not reserved, and hence of type WB
-- 
cgit v1.2.3-55-g7522


From 89a27f4d0e042a2fa3391a76b652aec3e16ef200 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Tue, 16 Feb 2010 10:51:48 +0200
Subject: KVM: use desc_ptr struct instead of kvm private descriptor_table

x86 arch defines desc_ptr for idt/gdt pointers, no need to define
another structure in kvm code.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 17 ++++++---------
 arch/x86/kvm/svm.c              | 28 ++++++++++++-------------
 arch/x86/kvm/vmx.c              | 36 ++++++++++++++++----------------
 arch/x86/kvm/x86.c              | 46 ++++++++++++++++++++---------------------
 4 files changed, 61 insertions(+), 66 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 06d9e79ca37d..cf392dfb8000 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -461,11 +461,6 @@ struct kvm_vcpu_stat {
 	u32 nmi_injections;
 };
 
-struct descriptor_table {
-	u16 limit;
-	unsigned long base;
-} __attribute__((packed));
-
 struct kvm_x86_ops {
 	int (*cpu_has_kvm_support)(void);          /* __init */
 	int (*disabled_by_bios)(void);             /* __init */
@@ -503,10 +498,10 @@ struct kvm_x86_ops {
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 	void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
 	void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
-	void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+	void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 	int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
 	int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
 	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
@@ -724,12 +719,12 @@ static inline void kvm_load_ldt(u16 sel)
 	asm("lldt %0" : : "rm"(sel));
 }
 
-static inline void kvm_get_idt(struct descriptor_table *table)
+static inline void kvm_get_idt(struct desc_ptr *table)
 {
 	asm("sidt %0" : "=m"(*table));
 }
 
-static inline void kvm_get_gdt(struct descriptor_table *table)
+static inline void kvm_get_gdt(struct desc_ptr *table)
 {
 	asm("sgdt %0" : "=m"(*table));
 }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2ba58206812a..77fa2e3053b5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -319,7 +319,7 @@ static int svm_hardware_enable(void *garbage)
 
 	struct svm_cpu_data *sd;
 	uint64_t efer;
-	struct descriptor_table gdt_descr;
+	struct desc_ptr gdt_descr;
 	struct desc_struct *gdt;
 	int me = raw_smp_processor_id();
 
@@ -345,7 +345,7 @@ static int svm_hardware_enable(void *garbage)
 	sd->next_asid = sd->max_asid + 1;
 
 	kvm_get_gdt(&gdt_descr);
-	gdt = (struct desc_struct *)gdt_descr.base;
+	gdt = (struct desc_struct *)gdt_descr.address;
 	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 
 	wrmsrl(MSR_EFER, efer | EFER_SVME);
@@ -936,36 +936,36 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
 	return save->cpl;
 }
 
-static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	dt->limit = svm->vmcb->save.idtr.limit;
-	dt->base = svm->vmcb->save.idtr.base;
+	dt->size = svm->vmcb->save.idtr.limit;
+	dt->address = svm->vmcb->save.idtr.base;
 }
 
-static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->save.idtr.limit = dt->limit;
-	svm->vmcb->save.idtr.base = dt->base ;
+	svm->vmcb->save.idtr.limit = dt->size;
+	svm->vmcb->save.idtr.base = dt->address ;
 }
 
-static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	dt->limit = svm->vmcb->save.gdtr.limit;
-	dt->base = svm->vmcb->save.gdtr.base;
+	dt->size = svm->vmcb->save.gdtr.limit;
+	dt->address = svm->vmcb->save.gdtr.base;
 }
 
-static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->save.gdtr.limit = dt->limit;
-	svm->vmcb->save.gdtr.base = dt->base ;
+	svm->vmcb->save.gdtr.limit = dt->size;
+	svm->vmcb->save.gdtr.base = dt->address ;
 }
 
 static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bc933cfb4e66..68f895b00450 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -600,11 +600,11 @@ static void reload_tss(void)
 	/*
 	 * VT restores TR but not its size.  Useless.
 	 */
-	struct descriptor_table gdt;
+	struct desc_ptr gdt;
 	struct desc_struct *descs;
 
 	kvm_get_gdt(&gdt);
-	descs = (void *)gdt.base;
+	descs = (void *)gdt.address;
 	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
 	load_TR_desc();
 }
@@ -758,7 +758,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	if (vcpu->cpu != cpu) {
-		struct descriptor_table dt;
+		struct desc_ptr dt;
 		unsigned long sysenter_esp;
 
 		vcpu->cpu = cpu;
@@ -768,7 +768,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		 */
 		vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
 		kvm_get_gdt(&dt);
-		vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
+		vmcs_writel(HOST_GDTR_BASE, dt.address);   /* 22.2.4 */
 
 		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -1934,28 +1934,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 	*l = (ar >> 13) & 1;
 }
 
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-	dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
-	dt->base = vmcs_readl(GUEST_IDTR_BASE);
+	dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
+	dt->address = vmcs_readl(GUEST_IDTR_BASE);
 }
 
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-	vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
-	vmcs_writel(GUEST_IDTR_BASE, dt->base);
+	vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
+	vmcs_writel(GUEST_IDTR_BASE, dt->address);
 }
 
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-	dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
-	dt->base = vmcs_readl(GUEST_GDTR_BASE);
+	dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
+	dt->address = vmcs_readl(GUEST_GDTR_BASE);
 }
 
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-	vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
-	vmcs_writel(GUEST_GDTR_BASE, dt->base);
+	vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
+	vmcs_writel(GUEST_GDTR_BASE, dt->address);
 }
 
 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
@@ -2334,7 +2334,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	u32 junk;
 	u64 host_pat, tsc_this, tsc_base;
 	unsigned long a;
-	struct descriptor_table dt;
+	struct desc_ptr dt;
 	int i;
 	unsigned long kvm_vmx_return;
 	u32 exec_control;
@@ -2416,7 +2416,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
 
 	kvm_get_idt(&dt);
-	vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
+	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
 
 	asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
 	vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c4ca98ad27f..274a8e39bca7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -225,7 +225,7 @@ static void drop_user_return_notifiers(void *ignore)
 
 unsigned long segment_base(u16 selector)
 {
-	struct descriptor_table gdt;
+	struct desc_ptr gdt;
 	struct desc_struct *d;
 	unsigned long table_base;
 	unsigned long v;
@@ -234,7 +234,7 @@ unsigned long segment_base(u16 selector)
 		return 0;
 
 	kvm_get_gdt(&gdt);
-	table_base = gdt.base;
+	table_base = gdt.address;
 
 	if (selector & 4) {           /* from ldt */
 		u16 ldt_selector = kvm_read_ldt();
@@ -3949,14 +3949,14 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val)
 
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 {
-	struct descriptor_table dt = { limit, base };
+	struct desc_ptr dt = { limit, base };
 
 	kvm_x86_ops->set_gdt(vcpu, &dt);
 }
 
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 {
-	struct descriptor_table dt = { limit, base };
+	struct desc_ptr dt = { limit, base };
 
 	kvm_x86_ops->set_idt(vcpu, &dt);
 }
@@ -4581,7 +4581,7 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
-	struct descriptor_table dt;
+	struct desc_ptr dt;
 
 	vcpu_load(vcpu);
 
@@ -4596,11 +4596,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 
 	kvm_x86_ops->get_idt(vcpu, &dt);
-	sregs->idt.limit = dt.limit;
-	sregs->idt.base = dt.base;
+	sregs->idt.limit = dt.size;
+	sregs->idt.base = dt.address;
 	kvm_x86_ops->get_gdt(vcpu, &dt);
-	sregs->gdt.limit = dt.limit;
-	sregs->gdt.base = dt.base;
+	sregs->gdt.limit = dt.size;
+	sregs->gdt.base = dt.address;
 
 	sregs->cr0 = kvm_read_cr0(vcpu);
 	sregs->cr2 = vcpu->arch.cr2;
@@ -4672,7 +4672,7 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
 
 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
 					  u16 selector,
-					  struct descriptor_table *dtable)
+					  struct desc_ptr *dtable)
 {
 	if (selector & 1 << 2) {
 		struct kvm_segment kvm_seg;
@@ -4680,10 +4680,10 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
 		kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
 
 		if (kvm_seg.unusable)
-			dtable->limit = 0;
+			dtable->size = 0;
 		else
-			dtable->limit = kvm_seg.limit;
-		dtable->base = kvm_seg.base;
+			dtable->size = kvm_seg.limit;
+		dtable->address = kvm_seg.base;
 	}
 	else
 		kvm_x86_ops->get_gdt(vcpu, dtable);
@@ -4693,7 +4693,7 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 					 struct desc_struct *seg_desc)
 {
-	struct descriptor_table dtable;
+	struct desc_ptr dtable;
 	u16 index = selector >> 3;
 	int ret;
 	u32 err;
@@ -4701,7 +4701,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 
 	get_segment_descriptor_dtable(vcpu, selector, &dtable);
 
-	if (dtable.limit < index * 8 + 7) {
+	if (dtable.size < index * 8 + 7) {
 		kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
 		return X86EMUL_PROPAGATE_FAULT;
 	}
@@ -4718,14 +4718,14 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 					 struct desc_struct *seg_desc)
 {
-	struct descriptor_table dtable;
+	struct desc_ptr dtable;
 	u16 index = selector >> 3;
 
 	get_segment_descriptor_dtable(vcpu, selector, &dtable);
 
-	if (dtable.limit < index * 8 + 7)
+	if (dtable.size < index * 8 + 7)
 		return 1;
-	return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
+	return kvm_write_guest_virt(dtable.address + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
 }
 
 static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
@@ -5204,15 +5204,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 {
 	int mmu_reset_needed = 0;
 	int pending_vec, max_bits;
-	struct descriptor_table dt;
+	struct desc_ptr dt;
 
 	vcpu_load(vcpu);
 
-	dt.limit = sregs->idt.limit;
-	dt.base = sregs->idt.base;
+	dt.size = sregs->idt.limit;
+	dt.address = sregs->idt.base;
 	kvm_x86_ops->set_idt(vcpu, &dt);
-	dt.limit = sregs->gdt.limit;
-	dt.base = sregs->gdt.base;
+	dt.size = sregs->gdt.limit;
+	dt.address = sregs->gdt.base;
 	kvm_x86_ops->set_gdt(vcpu, &dt);
 
 	vcpu->arch.cr2 = sregs->cr2;
-- 
cgit v1.2.3-55-g7522


From 1161624f15f584096a0df3dda70403cd1d00721e Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 11 Feb 2010 14:43:14 +0200
Subject: KVM: inject #UD in 64bit mode from instruction that are not valid
 there

Some instruction are obsolete in a long mode. Inject #UD.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4dade6ac0827..96d4bef06e14 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1015,11 +1015,6 @@ done_prefixes:
 		}
 	}
 
-	if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
-		kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
-		return -1;
-	}
-
 	if (c->d & Group) {
 		group = c->d & GroupMask;
 		c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1828,6 +1823,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 	saved_eip = c->eip;
 
+	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		goto done;
+	}
+
 	/* LOCK prefix is allowed only with some instructions */
 	if (c->lock_prefix && !(c->d & Lock)) {
 		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
-- 
cgit v1.2.3-55-g7522


From 3e2815e9fa6c06bcb8a9340e43008bbe48437d25 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Fri, 12 Feb 2010 15:53:59 +0900
Subject: KVM: x86 emulator: X86EMUL macro replacements: from
 do_fetch_insn_byte() to x86_decode_insn()

This patch just replaces the integer values used inside x86's
decode functions to X86EMUL_*.

By this patch, it becomes clearer that we are using X86EMUL_*
value propagated from ops->read_std() in do_fetch_insn_byte().

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 96d4bef06e14..b8aed35ab5f9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -647,20 +647,20 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 	if (linear < fc->start || linear >= fc->end) {
 		size = min(15UL, PAGE_SIZE - offset_in_page(linear));
 		rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
-		if (rc)
+		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		fc->start = linear;
 		fc->end = linear + size;
 	}
 	*dest = fc->data[linear - fc->start];
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 			 struct x86_emulate_ops *ops,
 			 unsigned long eip, void *dest, unsigned size)
 {
-	int rc = 0;
+	int rc;
 
 	/* x86 instructions are limited to 15 bytes. */
 	if (eip + size - ctxt->decode.eip_orig > 15)
@@ -668,10 +668,10 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 	eip += ctxt->cs_base;
 	while (size--) {
 		rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
-		if (rc)
+		if (rc != X86EMUL_CONTINUE)
 			return rc;
 	}
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 /*
@@ -782,7 +782,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	u8 sib;
 	int index_reg = 0, base_reg = 0, scale;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 
 	if (c->rex_prefix) {
 		c->modrm_reg = (c->rex_prefix & 4) << 1;	/* REX.R */
@@ -895,7 +895,7 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
 		      struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 
 	switch (c->ad_bytes) {
 	case 2:
@@ -916,7 +916,7 @@ int
 x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 	int mode = ctxt->mode;
 	int def_op_bytes, def_ad_bytes, group;
 
@@ -1041,7 +1041,7 @@ done_prefixes:
 		rc = decode_modrm(ctxt, ops);
 	else if (c->d & MemAbs)
 		rc = decode_abs(ctxt, ops);
-	if (rc)
+	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
 	if (!c->has_seg_override)
-- 
cgit v1.2.3-55-g7522


From 1b30eaa84609031c06e417eafd5b68f45e4266f7 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Fri, 12 Feb 2010 15:57:56 +0900
Subject: KVM: x86 emulator: X86EMUL macro replacements: x86_emulate_insn() and
 its helpers

This patch just replaces integer values used inside
x86_emulate_insn() and its helper functions to X86EMUL_*.

The purpose of this is to make it clear what will happen
when the variable rc is compared to X86EMUL_* at the end
of x86_emulate_insn().

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 62 +++++++++++++++++++++++---------------------------
 1 file changed, 29 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b8aed35ab5f9..ee1a2a2c12e9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -702,7 +702,7 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 	*address = 0;
 	rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
 			   ctxt->vcpu, NULL);
-	if (rc)
+	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
 			   ctxt->vcpu, NULL);
@@ -1301,7 +1301,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
 	int rc;
 
 	rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
-	if (rc != 0)
+	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
 	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
@@ -1327,7 +1327,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
 			struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 	int reg = VCPU_REGS_RDI;
 
 	while (reg >= VCPU_REGS_RAX) {
@@ -1338,7 +1338,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
 		}
 
 		rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			break;
 		--reg;
 	}
@@ -1349,12 +1349,8 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
 				struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc;
 
-	rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
-	if (rc != 0)
-		return rc;
-	return 0;
+	return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
 }
 
 static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
@@ -1390,7 +1386,7 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 			       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 
 	switch (c->modrm_reg) {
 	case 0 ... 1:	/* test */
@@ -1437,7 +1433,7 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
 		emulate_push(ctxt);
 		break;
 	}
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
@@ -1468,7 +1464,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
 			return rc;
 		ctxt->eflags |= EFLG_ZF;
 	}
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
@@ -1479,12 +1475,12 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
 	unsigned long cs;
 
 	rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
-	if (rc)
+	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	if (c->op_bytes == 4)
 		c->eip = (u32)c->eip;
 	rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
-	if (rc)
+	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
 	return rc;
@@ -1539,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 	default:
 		break;
 	}
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
@@ -1811,7 +1807,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	struct decode_cache *c = &ctxt->decode;
 	unsigned int port;
 	int io_dir_in;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 
 	ctxt->interruptibility = 0;
 
@@ -1926,7 +1922,7 @@ special_insn:
 		break;
 	case 0x07:		/* pop es */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x08 ... 0x0d:
@@ -1945,7 +1941,7 @@ special_insn:
 		break;
 	case 0x17:		/* pop ss */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x18 ... 0x1d:
@@ -1957,7 +1953,7 @@ special_insn:
 		break;
 	case 0x1f:		/* pop ds */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x20 ... 0x25:
@@ -1988,7 +1984,7 @@ special_insn:
 	case 0x58 ... 0x5f: /* pop reg */
 	pop_instruction:
 		rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x60:	/* pusha */
@@ -1996,7 +1992,7 @@ special_insn:
 		break;
 	case 0x61:	/* popa */
 		rc = emulate_popa(ctxt, ops);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x63:		/* movsxd */
@@ -2141,7 +2137,7 @@ special_insn:
 	}
 	case 0x8f:		/* pop (sole member of Grp1a) */
 		rc = emulate_grp1a(ctxt, ops);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x90: /* nop / xchg r8,rax */
@@ -2277,7 +2273,7 @@ special_insn:
 		break;
 	case 0xcb:		/* ret far */
 		rc = emulate_ret_far(ctxt, ops);
-		if (rc)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
@@ -2351,7 +2347,7 @@ special_insn:
 		break;
 	case 0xf6 ... 0xf7:	/* Grp3 */
 		rc = emulate_grp3(ctxt, ops);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0xf8: /* clc */
@@ -2385,14 +2381,14 @@ special_insn:
 		break;
 	case 0xfe ... 0xff:	/* Grp4/Grp5 */
 		rc = emulate_grp45(ctxt, ops);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	}
 
 writeback:
 	rc = writeback(ctxt, ops);
-	if (rc != 0)
+	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
 	/* Commit shadow register state. */
@@ -2418,7 +2414,7 @@ twobyte_insn:
 				goto cannot_emulate;
 
 			rc = kvm_fix_hypercall(ctxt->vcpu);
-			if (rc)
+			if (rc != X86EMUL_CONTINUE)
 				goto done;
 
 			/* Let the processor re-execute the fixed hypercall */
@@ -2429,7 +2425,7 @@ twobyte_insn:
 		case 2: /* lgdt */
 			rc = read_descriptor(ctxt, ops, c->src.ptr,
 					     &size, &address, c->op_bytes);
-			if (rc)
+			if (rc != X86EMUL_CONTINUE)
 				goto done;
 			realmode_lgdt(ctxt->vcpu, size, address);
 			/* Disable writeback. */
@@ -2440,7 +2436,7 @@ twobyte_insn:
 				switch (c->modrm_rm) {
 				case 1:
 					rc = kvm_fix_hypercall(ctxt->vcpu);
-					if (rc)
+					if (rc != X86EMUL_CONTINUE)
 						goto done;
 					break;
 				default:
@@ -2450,7 +2446,7 @@ twobyte_insn:
 				rc = read_descriptor(ctxt, ops, c->src.ptr,
 						     &size, &address,
 						     c->op_bytes);
-				if (rc)
+				if (rc != X86EMUL_CONTINUE)
 					goto done;
 				realmode_lidt(ctxt->vcpu, size, address);
 			}
@@ -2577,7 +2573,7 @@ twobyte_insn:
 		break;
 	case 0xa1:	 /* pop fs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0xa3:
@@ -2596,7 +2592,7 @@ twobyte_insn:
 		break;
 	case 0xa9:	/* pop gs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0xab:
@@ -2669,7 +2665,7 @@ twobyte_insn:
 		break;
 	case 0xc7:		/* Grp9 (cmpxchg8b) */
 		rc = emulate_grp9(ctxt, ops, memop);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		c->dst.type = OP_NONE;
 		break;
-- 
cgit v1.2.3-55-g7522


From 0e4176a15f9af494ad098cb5a76bcfa17e14282b Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Fri, 12 Feb 2010 16:00:55 +0900
Subject: KVM: x86 emulator: Fix x86_emulate_insn() not to use the variable rc
 for non-X86EMUL values

This patch makes non-X86EMUL_* family functions not to use
the variable rc.

Be sure that this changes nothing but makes the purpose of
the variable rc clearer.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ee1a2a2c12e9..35f7acd4a91f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2498,9 +2498,9 @@ twobyte_insn:
 	case 0x21: /* mov from dr to reg */
 		if (c->modrm_mod != 3)
 			goto cannot_emulate;
-		rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
-		if (rc)
+		if (emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]))
 			goto cannot_emulate;
+		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x22: /* mov reg, cr */
@@ -2513,18 +2513,16 @@ twobyte_insn:
 	case 0x23: /* mov from reg to dr */
 		if (c->modrm_mod != 3)
 			goto cannot_emulate;
-		rc = emulator_set_dr(ctxt, c->modrm_reg,
-				     c->regs[c->modrm_rm]);
-		if (rc)
+		if (emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]))
 			goto cannot_emulate;
+		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x30:
 		/* wrmsr */
 		msr_data = (u32)c->regs[VCPU_REGS_RAX]
 			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
-		rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
-		if (rc) {
+		if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			c->eip = kvm_rip_read(ctxt->vcpu);
 		}
@@ -2533,8 +2531,7 @@ twobyte_insn:
 		break;
 	case 0x32:
 		/* rdmsr */
-		rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
-		if (rc) {
+		if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			c->eip = kvm_rip_read(ctxt->vcpu);
 		} else {
-- 
cgit v1.2.3-55-g7522


From ad91f8ffbb18413e79f9f976a55b4e11d02e6a6d Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Fri, 12 Feb 2010 16:02:54 +0900
Subject: KVM: remove redundant prototype of load_pdptrs()

This patch removes redundant prototype of load_pdptrs().

I found load_pdptrs() twice in kvm_host.h. Let's remove one.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cf392dfb8000..d46e791de129 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -670,7 +670,6 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_enable_tdp(void);
 void kvm_disable_tdp(void);
 
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
 int complete_pio(struct kvm_vcpu *vcpu);
 bool kvm_check_iopl(struct kvm_vcpu *vcpu);
 
-- 
cgit v1.2.3-55-g7522


From 7597f129d8b6799da7a264e6d6f7401668d3a36d Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Fri, 19 Feb 2010 16:23:00 +0100
Subject: KVM: SVM: Don't use kmap_atomic in nested_svm_map

Use of kmap_atomic disables preemption but if we run in
shadow-shadow mode the vmrun emulation executes kvm_set_cr3
which might sleep or fault. So use kmap instead for
nested_svm_map.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 47 ++++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 77fa2e3053b5..f9da35b06ec7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1423,7 +1423,7 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
 	return 0;
 }
 
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
 {
 	struct page *page;
 
@@ -1431,7 +1431,9 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
 	if (is_error_page(page))
 		goto error;
 
-	return kmap_atomic(page, idx);
+	*_page = page;
+
+	return kmap(page);
 
 error:
 	kvm_release_page_clean(page);
@@ -1440,16 +1442,9 @@ error:
 	return NULL;
 }
 
-static void nested_svm_unmap(void *addr, enum km_type idx)
+static void nested_svm_unmap(struct page *page)
 {
-	struct page *page;
-
-	if (!addr)
-		return;
-
-	page = kmap_atomic_to_page(addr);
-
-	kunmap_atomic(addr, idx);
+	kunmap(page);
 	kvm_release_page_dirty(page);
 }
 
@@ -1457,6 +1452,7 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
 	u32 param = svm->vmcb->control.exit_info_1 & 1;
 	u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+	struct page *page;
 	bool ret = false;
 	u32 t0, t1;
 	u8 *msrpm;
@@ -1464,7 +1460,7 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
 		return false;
 
-	msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
+	msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, &page);
 
 	if (!msrpm)
 		goto out;
@@ -1492,7 +1488,7 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 	ret = msrpm[t1] & ((1 << param) << t0);
 
 out:
-	nested_svm_unmap(msrpm, KM_USER0);
+	nested_svm_unmap(page);
 
 	return ret;
 }
@@ -1615,6 +1611,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	struct vmcb *nested_vmcb;
 	struct vmcb *hsave = svm->nested.hsave;
 	struct vmcb *vmcb = svm->vmcb;
+	struct page *page;
 
 	trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
 				       vmcb->control.exit_info_1,
@@ -1622,7 +1619,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 				       vmcb->control.exit_int_info,
 				       vmcb->control.exit_int_info_err);
 
-	nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
+	nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
 	if (!nested_vmcb)
 		return 1;
 
@@ -1712,7 +1709,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	/* Exit nested SVM mode */
 	svm->nested.vmcb = 0;
 
-	nested_svm_unmap(nested_vmcb, KM_USER0);
+	nested_svm_unmap(page);
 
 	kvm_mmu_reset_context(&svm->vcpu);
 	kvm_mmu_load(&svm->vcpu);
@@ -1723,9 +1720,10 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 {
 	u32 *nested_msrpm;
+	struct page *page;
 	int i;
 
-	nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
+	nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, &page);
 	if (!nested_msrpm)
 		return false;
 
@@ -1734,7 +1732,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 
 	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
 
-	nested_svm_unmap(nested_msrpm, KM_USER0);
+	nested_svm_unmap(page);
 
 	return true;
 }
@@ -1744,8 +1742,9 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	struct vmcb *nested_vmcb;
 	struct vmcb *hsave = svm->nested.hsave;
 	struct vmcb *vmcb = svm->vmcb;
+	struct page *page;
 
-	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
 	if (!nested_vmcb)
 		return false;
 
@@ -1857,7 +1856,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
 	svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
-	nested_svm_unmap(nested_vmcb, KM_USER0);
+	nested_svm_unmap(page);
 
 	enable_gif(svm);
 
@@ -1883,6 +1882,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
 static int vmload_interception(struct vcpu_svm *svm)
 {
 	struct vmcb *nested_vmcb;
+	struct page *page;
 
 	if (nested_svm_check_permissions(svm))
 		return 1;
@@ -1890,12 +1890,12 @@ static int vmload_interception(struct vcpu_svm *svm)
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
 	skip_emulated_instruction(&svm->vcpu);
 
-	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
 	if (!nested_vmcb)
 		return 1;
 
 	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
-	nested_svm_unmap(nested_vmcb, KM_USER0);
+	nested_svm_unmap(page);
 
 	return 1;
 }
@@ -1903,6 +1903,7 @@ static int vmload_interception(struct vcpu_svm *svm)
 static int vmsave_interception(struct vcpu_svm *svm)
 {
 	struct vmcb *nested_vmcb;
+	struct page *page;
 
 	if (nested_svm_check_permissions(svm))
 		return 1;
@@ -1910,12 +1911,12 @@ static int vmsave_interception(struct vcpu_svm *svm)
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
 	skip_emulated_instruction(&svm->vcpu);
 
-	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
 	if (!nested_vmcb)
 		return 1;
 
 	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
-	nested_svm_unmap(nested_vmcb, KM_USER0);
+	nested_svm_unmap(page);
 
 	return 1;
 }
-- 
cgit v1.2.3-55-g7522


From b8e88bc8ffba5fe53fb8d8a0a4be3bbcffeebe56 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Fri, 19 Feb 2010 16:23:02 +0100
Subject: KVM: SVM: Fix schedule-while-atomic on nested exception handling

Move the actual vmexit routine out of code that runs with
irqs and preemption disabled.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f9da35b06ec7..c27da0ad040c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -129,6 +129,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
 static int nested_svm_exit_handled(struct vcpu_svm *svm);
+static int nested_svm_intercept(struct vcpu_svm *svm);
 static int nested_svm_vmexit(struct vcpu_svm *svm);
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 				      bool has_error_code, u32 error_code);
@@ -1384,6 +1385,8 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 				      bool has_error_code, u32 error_code)
 {
+	int vmexit;
+
 	if (!is_nested(svm))
 		return 0;
 
@@ -1392,7 +1395,11 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 	svm->vmcb->control.exit_info_1 = error_code;
 	svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
 
-	return nested_svm_exit_handled(svm);
+	vmexit = nested_svm_intercept(svm);
+	if (vmexit == NESTED_EXIT_DONE)
+		svm->nested.exit_required = true;
+
+	return vmexit;
 }
 
 static inline int nested_svm_intr(struct vcpu_svm *svm)
@@ -1521,7 +1528,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 /*
  * If this function returns true, this #vmexit was already handled
  */
-static int nested_svm_exit_handled(struct vcpu_svm *svm)
+static int nested_svm_intercept(struct vcpu_svm *svm)
 {
 	u32 exit_code = svm->vmcb->control.exit_code;
 	int vmexit = NESTED_EXIT_HOST;
@@ -1567,9 +1574,17 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
 	}
 	}
 
-	if (vmexit == NESTED_EXIT_DONE) {
+	return vmexit;
+}
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+	int vmexit;
+
+	vmexit = nested_svm_intercept(svm);
+
+	if (vmexit == NESTED_EXIT_DONE)
 		nested_svm_vmexit(svm);
-	}
 
 	return vmexit;
 }
-- 
cgit v1.2.3-55-g7522


From cdbbdc1210223879450555fee04c29ebf116576b Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Fri, 19 Feb 2010 16:23:03 +0100
Subject: KVM: SVM: Sync all control registers on nested vmexit

Currently the vmexit emulation does not sync control
registers were the access is typically intercepted by the
nested hypervisor. But we can not count on that intercepts
to sync these registers too and make the code
architecturally more correct.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c27da0ad040c..02f8d491d15a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1647,9 +1647,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->save.ds     = vmcb->save.ds;
 	nested_vmcb->save.gdtr   = vmcb->save.gdtr;
 	nested_vmcb->save.idtr   = vmcb->save.idtr;
+	nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
 	if (npt_enabled)
 		nested_vmcb->save.cr3    = vmcb->save.cr3;
+	else
+		nested_vmcb->save.cr3    = svm->vcpu.arch.cr3;
 	nested_vmcb->save.cr2    = vmcb->save.cr2;
+	nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
 	nested_vmcb->save.rflags = vmcb->save.rflags;
 	nested_vmcb->save.rip    = vmcb->save.rip;
 	nested_vmcb->save.rsp    = vmcb->save.rsp;
-- 
cgit v1.2.3-55-g7522


From 6c3bd3d7660c35a703073b81eccfd5a3b7c15295 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Fri, 19 Feb 2010 16:23:04 +0100
Subject: KVM: SVM: Annotate nested_svm_map with might_sleep()

The nested_svm_map() function can sleep and must not be
called from atomic context. So annotate that function.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 02f8d491d15a..4bc018333d76 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1434,6 +1434,8 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
 {
 	struct page *page;
 
+	might_sleep();
+
 	page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
 	if (is_error_page(page))
 		goto error;
-- 
cgit v1.2.3-55-g7522


From 4c7da8cb43c09e71a405b5aeaa58a1dbac3c39e9 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Fri, 19 Feb 2010 16:23:05 +0100
Subject: KVM: SVM: Fix nested msr intercept handling

The nested_svm_exit_handled_msr() function maps only one
page of the guests msr permission bitmap. This patch changes
the code to use kvm_read_guest to fix the bug.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4bc018333d76..4459c477af9f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1461,19 +1461,13 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
 	u32 param = svm->vmcb->control.exit_info_1 & 1;
 	u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-	struct page *page;
 	bool ret = false;
 	u32 t0, t1;
-	u8 *msrpm;
+	u8 val;
 
 	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
 		return false;
 
-	msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, &page);
-
-	if (!msrpm)
-		goto out;
-
 	switch (msr) {
 	case 0 ... 0x1fff:
 		t0 = (msr * 2) % 8;
@@ -1494,11 +1488,10 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 		goto out;
 	}
 
-	ret = msrpm[t1] & ((1 << param) << t0);
+	if (!kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + t1, &val, 1))
+		ret = val & ((1 << param) << t0);
 
 out:
-	nested_svm_unmap(page);
-
 	return ret;
 }
 
-- 
cgit v1.2.3-55-g7522


From 88ab24adc7142506c8583ac36a34fa388300b750 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Fri, 19 Feb 2010 16:23:06 +0100
Subject: KVM: SVM: Don't sync nested cr8 to lapic and back

This patch makes syncing of the guest tpr to the lapic
conditional on !nested. Otherwise a nested guest using the
TPR could freeze the guest.
Another important change this patch introduces is that the
cr8 intercept bits are no longer ORed at vmrun emulation if
the guest sets VINTR_MASKING in its VMCB. The reason is that
nested cr8 accesses need alway be handled by the nested
hypervisor because they change the shadow version of the
tpr.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 46 +++++++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4459c477af9f..481bd0ee5f7e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1832,21 +1832,6 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
 	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
 
-	/* We don't want a nested guest to be more powerful than the guest,
-	   so all intercepts are ORed */
-	svm->vmcb->control.intercept_cr_read |=
-		nested_vmcb->control.intercept_cr_read;
-	svm->vmcb->control.intercept_cr_write |=
-		nested_vmcb->control.intercept_cr_write;
-	svm->vmcb->control.intercept_dr_read |=
-		nested_vmcb->control.intercept_dr_read;
-	svm->vmcb->control.intercept_dr_write |=
-		nested_vmcb->control.intercept_dr_write;
-	svm->vmcb->control.intercept_exceptions |=
-		nested_vmcb->control.intercept_exceptions;
-
-	svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
-
 	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
 
 	/* cache intercepts */
@@ -1864,6 +1849,28 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	else
 		svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
 
+	if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
+		/* We only want the cr8 intercept bits of the guest */
+		svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
+		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+	}
+
+	/* We don't want a nested guest to be more powerful than the guest,
+	   so all intercepts are ORed */
+	svm->vmcb->control.intercept_cr_read |=
+		nested_vmcb->control.intercept_cr_read;
+	svm->vmcb->control.intercept_cr_write |=
+		nested_vmcb->control.intercept_cr_write;
+	svm->vmcb->control.intercept_dr_read |=
+		nested_vmcb->control.intercept_dr_read;
+	svm->vmcb->control.intercept_dr_write |=
+		nested_vmcb->control.intercept_dr_write;
+	svm->vmcb->control.intercept_exceptions |=
+		nested_vmcb->control.intercept_exceptions;
+
+	svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
+
+	svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
 	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
 	svm->vmcb->control.int_state = nested_vmcb->control.int_state;
 	svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
@@ -2526,6 +2533,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+		return;
+
 	if (irr == -1)
 		return;
 
@@ -2629,6 +2639,9 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+		return;
+
 	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
 		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
 		kvm_set_cr8(vcpu, cr8);
@@ -2640,6 +2653,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 cr8;
 
+	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+		return;
+
 	cr8 = kvm_get_cr8(vcpu);
 	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
 	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
-- 
cgit v1.2.3-55-g7522


From 06fc7772690dec2a0e3814633357babf8f63af41 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Fri, 19 Feb 2010 16:23:07 +0100
Subject: KVM: SVM: Activate nested state only when guest state is complete

Certain functions called during the emulated world switch
behave differently when the vcpu is running nested. This is
not the expected behavior during a world switch emulation.
This patch ensures that the nested state is activated only
if the vcpu is completly in nested state.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 481bd0ee5f7e..8ace0b0da933 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1633,6 +1633,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	if (!nested_vmcb)
 		return 1;
 
+	/* Exit nested SVM mode */
+	svm->nested.vmcb = 0;
+
 	/* Give the current vmcb to the guest */
 	disable_gif(svm);
 
@@ -1720,9 +1723,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	svm->vmcb->save.cpl = 0;
 	svm->vmcb->control.exit_int_info = 0;
 
-	/* Exit nested SVM mode */
-	svm->nested.vmcb = 0;
-
 	nested_svm_unmap(page);
 
 	kvm_mmu_reset_context(&svm->vcpu);
@@ -1757,14 +1757,14 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	struct vmcb *hsave = svm->nested.hsave;
 	struct vmcb *vmcb = svm->vmcb;
 	struct page *page;
+	u64 vmcb_gpa;
+
+	vmcb_gpa = svm->vmcb->save.rax;
 
 	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
 	if (!nested_vmcb)
 		return false;
 
-	/* nested_vmcb is our indicator if nested SVM is activated */
-	svm->nested.vmcb = svm->vmcb->save.rax;
-
 	trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
 			       nested_vmcb->save.rip,
 			       nested_vmcb->control.int_ctl,
@@ -1879,6 +1879,9 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
 	nested_svm_unmap(page);
 
+	/* nested_vmcb is our indicator if nested SVM is activated */
+	svm->nested.vmcb = vmcb_gpa;
+
 	enable_gif(svm);
 
 	return true;
-- 
cgit v1.2.3-55-g7522


From 66a562f7e2576cde384ec813b481404d8f54f4c6 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Fri, 19 Feb 2010 16:23:08 +0100
Subject: KVM: SVM: Make lazy FPU switching work with nested svm

The new lazy fpu switching code may disable cr0 intercepts
when running nested. This is a bug because the nested
hypervisor may still want to intercept cr0 which will break
in this situation. This patch fixes this issue and makes
lazy fpu switching working with nested svm.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 43 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8ace0b0da933..a8ec53fe74f5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -979,6 +979,7 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 
 static void update_cr0_intercept(struct vcpu_svm *svm)
 {
+	struct vmcb *vmcb = svm->vmcb;
 	ulong gcr0 = svm->vcpu.arch.cr0;
 	u64 *hcr0 = &svm->vmcb->save.cr0;
 
@@ -990,11 +991,25 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
 
 
 	if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
-		svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
-		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+		vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
+		vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+		if (is_nested(svm)) {
+			struct vmcb *hsave = svm->nested.hsave;
+
+			hsave->control.intercept_cr_read  &= ~INTERCEPT_CR0_MASK;
+			hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+			vmcb->control.intercept_cr_read  |= svm->nested.intercept_cr_read;
+			vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
+		}
 	} else {
 		svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
 		svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+		if (is_nested(svm)) {
+			struct vmcb *hsave = svm->nested.hsave;
+
+			hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
+			hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+		}
 	}
 }
 
@@ -1269,7 +1284,22 @@ static int ud_interception(struct vcpu_svm *svm)
 static void svm_fpu_activate(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
+	u32 excp;
+
+	if (is_nested(svm)) {
+		u32 h_excp, n_excp;
+
+		h_excp  = svm->nested.hsave->control.intercept_exceptions;
+		n_excp  = svm->nested.intercept_exceptions;
+		h_excp &= ~(1 << NM_VECTOR);
+		excp    = h_excp | n_excp;
+	} else {
+		excp  = svm->vmcb->control.intercept_exceptions;
+	        excp &= ~(1 << NM_VECTOR);
+	}
+
+	svm->vmcb->control.intercept_exceptions = excp;
+
 	svm->vcpu.fpu_active = 1;
 	update_cr0_intercept(svm);
 }
@@ -1513,6 +1543,9 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 		if (!npt_enabled)
 			return NESTED_EXIT_HOST;
 		break;
+	case SVM_EXIT_EXCP_BASE + NM_VECTOR:
+		nm_interception(svm);
+		break;
 	default:
 		break;
 	}
@@ -2980,8 +3013,10 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	update_cr0_intercept(svm);
 	svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
+	if (is_nested(svm))
+		svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
+	update_cr0_intercept(svm);
 }
 
 static struct kvm_x86_ops svm_x86_ops = {
-- 
cgit v1.2.3-55-g7522


From 052ce6211c4f7309988068fccdb7204c721871df Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Fri, 19 Feb 2010 16:23:09 +0100
Subject: KVM: SVM: Remove newlines from nested trace points

The tracing infrastructure adds its own newlines. Remove
them from the trace point printk format strings.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/trace.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6ad30a29f044..12f8d2dee984 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -413,7 +413,7 @@ TRACE_EVENT(kvm_nested_vmrun,
 	),
 
 	TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
-		  "event_inj: 0x%08x npt: %s\n",
+		  "event_inj: 0x%08x npt: %s",
 		__entry->rip, __entry->vmcb, __entry->nested_rip,
 		__entry->int_ctl, __entry->event_inj,
 		__entry->npt ? "on" : "off")
@@ -447,7 +447,7 @@ TRACE_EVENT(kvm_nested_vmexit,
 		__entry->exit_int_info_err	= exit_int_info_err;
 	),
 	TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
-		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
 		  __entry->rip,
 		  ftrace_print_symbols_seq(p, __entry->exit_code,
 					   kvm_x86_ops->exit_reasons_str),
@@ -482,7 +482,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
 	),
 
 	TP_printk("reason: %s ext_inf1: 0x%016llx "
-		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
 		  ftrace_print_symbols_seq(p, __entry->exit_code,
 					   kvm_x86_ops->exit_reasons_str),
 		__entry->exit_info1, __entry->exit_info2,
@@ -504,7 +504,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
 		__entry->rip	=	rip
 	),
 
-	TP_printk("rip: 0x%016llx\n", __entry->rip)
+	TP_printk("rip: 0x%016llx", __entry->rip)
 );
 
 /*
@@ -526,7 +526,7 @@ TRACE_EVENT(kvm_invlpga,
 		__entry->address	=	address;
 	),
 
-	TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
+	TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
 		  __entry->rip, __entry->asid, __entry->address)
 );
 
@@ -547,7 +547,7 @@ TRACE_EVENT(kvm_skinit,
 		__entry->slb		=	slb;
 	),
 
-	TP_printk("rip: 0x%016llx slb: 0x%08x\n",
+	TP_printk("rip: 0x%016llx slb: 0x%08x",
 		  __entry->rip, __entry->slb)
 );
 
-- 
cgit v1.2.3-55-g7522


From 112592da0dc2460c95e8a89d0c5657c6a30286aa Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Sun, 21 Feb 2010 15:00:47 +0200
Subject: KVM: drop unneeded kvm_run check in emulate_instruction()

vcpu->run is initialized on vcpu creation and can never be NULL
here.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 274a8e39bca7..1d27a57026ab 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3443,7 +3443,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	if (vcpu->arch.pio.string)
 		return EMULATE_DO_MMIO;
 
-	if ((r || vcpu->mmio_is_write) && run) {
+	if (r || vcpu->mmio_is_write) {
 		run->exit_reason = KVM_EXIT_MMIO;
 		run->mmio.phys_addr = vcpu->mmio_phys_addr;
 		memcpy(run->mmio.data, vcpu->mmio_data, 8);
-- 
cgit v1.2.3-55-g7522


From 8fe546547cf6857a9d984bfe2f2194910f3fc5d0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Fri, 19 Feb 2010 16:23:01 +0100
Subject: KVM: SVM: Fix wrong interrupt injection in enable_irq_windows

The nested_svm_intr() function does not execute the vmexit
anymore. Therefore we may still be in the nested state after
that function ran. This patch changes the nested_svm_intr()
function to return wether the irq window could be enabled.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a8ec53fe74f5..294bbca34173 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1432,16 +1432,17 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 	return vmexit;
 }
 
-static inline int nested_svm_intr(struct vcpu_svm *svm)
+/* This function returns true if it is save to enable the irq window */
+static inline bool nested_svm_intr(struct vcpu_svm *svm)
 {
 	if (!is_nested(svm))
-		return 0;
+		return true;
 
 	if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
-		return 0;
+		return true;
 
 	if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
-		return 0;
+		return false;
 
 	svm->vmcb->control.exit_code = SVM_EXIT_INTR;
 
@@ -1454,10 +1455,10 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
 		 */
 		svm->nested.exit_required = true;
 		trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-		return 1;
+		return false;
 	}
 
-	return 0;
+	return true;
 }
 
 static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
@@ -2629,13 +2630,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	nested_svm_intr(svm);
-
 	/* In case GIF=0 we can't rely on the CPU to tell us when
 	 * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
 	 * The next time we get that intercept, this function will be
 	 * called again though and we'll get the vintr intercept. */
-	if (gif_set(svm)) {
+	if (gif_set(svm) && nested_svm_intr(svm)) {
 		svm_set_vintr(svm);
 		svm_inject_irq(svm, 0x0);
 	}
-- 
cgit v1.2.3-55-g7522


From 03b82a30ea8b26199901b219848d706dbd70c609 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Mon, 15 Feb 2010 10:45:41 +0100
Subject: KVM: x86: Do not return soft events in vcpu_events

To avoid that user space migrates a pending software exception or
interrupt, mask them out on KVM_GET_VCPU_EVENTS. Without this, user
space would try to reinject them, and we would have to reconstruct the
proper instruction length for VMX event injection. Now the pending event
will be reinjected via executing the triggering instruction again.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1d27a57026ab..2b1c9f2fb8dd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2100,14 +2100,17 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 {
 	vcpu_load(vcpu);
 
-	events->exception.injected = vcpu->arch.exception.pending;
+	events->exception.injected =
+		vcpu->arch.exception.pending &&
+		!kvm_exception_is_soft(vcpu->arch.exception.nr);
 	events->exception.nr = vcpu->arch.exception.nr;
 	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
 	events->exception.error_code = vcpu->arch.exception.error_code;
 
-	events->interrupt.injected = vcpu->arch.interrupt.pending;
+	events->interrupt.injected =
+		vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
 	events->interrupt.nr = vcpu->arch.interrupt.nr;
-	events->interrupt.soft = vcpu->arch.interrupt.soft;
+	events->interrupt.soft = 0;
 
 	events->nmi.injected = vcpu->arch.nmi_injected;
 	events->nmi.pending = vcpu->arch.nmi_pending;
-- 
cgit v1.2.3-55-g7522


From 48005f64d0ea965d454e38b5181af4aba9bdef5b Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Fri, 19 Feb 2010 19:38:07 +0100
Subject: KVM: x86: Save&restore interrupt shadow mask

The interrupt shadow created by STI or MOV-SS-like operations is part of
the VCPU state and must be preserved across migration. Transfer it in
the spare padding field of kvm_vcpu_events.interrupt.

As a side effect we now have to make vmx_set_interrupt_shadow robust
against both shadow types being set. Give MOV SS a higher priority and
skip STI in that case to avoid that VMX throws a fault on next entry.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kvm/api.txt          | 11 ++++++++++-
 arch/x86/include/asm/kvm.h         |  7 ++++++-
 arch/x86/include/asm/kvm_emulate.h |  3 ---
 arch/x86/kvm/emulate.c             |  4 ++--
 arch/x86/kvm/svm.c                 |  2 +-
 arch/x86/kvm/vmx.c                 |  8 ++++----
 arch/x86/kvm/x86.c                 | 12 ++++++++++--
 include/linux/kvm.h                |  1 +
 8 files changed, 34 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index beb444a95013..9e5de5a1c4ef 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -656,6 +656,7 @@ struct kvm_clock_data {
 4.29 KVM_GET_VCPU_EVENTS
 
 Capability: KVM_CAP_VCPU_EVENTS
+Extended by: KVM_CAP_INTR_SHADOW
 Architectures: x86
 Type: vm ioctl
 Parameters: struct kvm_vcpu_event (out)
@@ -676,7 +677,7 @@ struct kvm_vcpu_events {
 		__u8 injected;
 		__u8 nr;
 		__u8 soft;
-		__u8 pad;
+		__u8 shadow;
 	} interrupt;
 	struct {
 		__u8 injected;
@@ -688,9 +689,13 @@ struct kvm_vcpu_events {
 	__u32 flags;
 };
 
+KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
+interrupt.shadow contains a valid state. Otherwise, this field is undefined.
+
 4.30 KVM_SET_VCPU_EVENTS
 
 Capability: KVM_CAP_VCPU_EVENTS
+Extended by: KVM_CAP_INTR_SHADOW
 Architectures: x86
 Type: vm ioctl
 Parameters: struct kvm_vcpu_event (in)
@@ -709,6 +714,10 @@ current in-kernel state. The bits are:
 KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel
 KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector
 
+If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in
+the flags field to signal that interrupt.shadow contains a valid state and
+shall be written into the VCPU.
+
 
 5. The kvm_run structure
 
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index f46b79f6c16c..fb6117063ea3 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -257,6 +257,11 @@ struct kvm_reinject_control {
 /* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
 #define KVM_VCPUEVENT_VALID_NMI_PENDING	0x00000001
 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
+#define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
+
+/* Interrupt shadow states */
+#define KVM_X86_SHADOW_INT_MOV_SS	0x01
+#define KVM_X86_SHADOW_INT_STI		0x02
 
 /* for KVM_GET/SET_VCPU_EVENTS */
 struct kvm_vcpu_events {
@@ -271,7 +276,7 @@ struct kvm_vcpu_events {
 		__u8 injected;
 		__u8 nr;
 		__u8 soft;
-		__u8 pad;
+		__u8 shadow;
 	} interrupt;
 	struct {
 		__u8 injected;
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7a6f54fa13ba..2666d7ac3229 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -153,9 +153,6 @@ struct decode_cache {
 	struct fetch_cache fetch;
 };
 
-#define X86_SHADOW_INT_MOV_SS  1
-#define X86_SHADOW_INT_STI     2
-
 struct x86_emulate_ctxt {
 	/* Register state before/after emulation. */
 	struct kvm_vcpu *vcpu;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 35f7acd4a91f..c9f604b0819c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2128,7 +2128,7 @@ special_insn:
 		}
 
 		if (c->modrm_reg == VCPU_SREG_SS)
-			toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
 
 		rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
 
@@ -2366,7 +2366,7 @@ special_insn:
 		if (emulator_bad_iopl(ctxt))
 			kvm_inject_gp(ctxt->vcpu, 0);
 		else {
-			toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
+			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
 			ctxt->eflags |= X86_EFLAGS_IF;
 			c->dst.type = OP_NONE;	/* Disable writeback. */
 		}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 294bbca34173..bd8f52f0823f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -265,7 +265,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 	u32 ret = 0;
 
 	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
-		ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
+		ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 	return ret & mask;
 }
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 68f895b00450..61f03980adae 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -846,9 +846,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 	int ret = 0;
 
 	if (interruptibility & GUEST_INTR_STATE_STI)
-		ret |= X86_SHADOW_INT_STI;
+		ret |= KVM_X86_SHADOW_INT_STI;
 	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
-		ret |= X86_SHADOW_INT_MOV_SS;
+		ret |= KVM_X86_SHADOW_INT_MOV_SS;
 
 	return ret & mask;
 }
@@ -860,9 +860,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 
 	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
 
-	if (mask & X86_SHADOW_INT_MOV_SS)
+	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
 		interruptibility |= GUEST_INTR_STATE_MOV_SS;
-	if (mask & X86_SHADOW_INT_STI)
+	else if (mask & KVM_X86_SHADOW_INT_STI)
 		interruptibility |= GUEST_INTR_STATE_STI;
 
 	if ((interruptibility != interruptibility_old))
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b1c9f2fb8dd..84ffd95ee198 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2111,6 +2111,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 		vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
 	events->interrupt.nr = vcpu->arch.interrupt.nr;
 	events->interrupt.soft = 0;
+	events->interrupt.shadow =
+		kvm_x86_ops->get_interrupt_shadow(vcpu,
+			KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
 
 	events->nmi.injected = vcpu->arch.nmi_injected;
 	events->nmi.pending = vcpu->arch.nmi_pending;
@@ -2119,7 +2122,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	events->sipi_vector = vcpu->arch.sipi_vector;
 
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
-			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
+			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+			 | KVM_VCPUEVENT_VALID_SHADOW);
 
 	vcpu_put(vcpu);
 }
@@ -2128,7 +2132,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 					      struct kvm_vcpu_events *events)
 {
 	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
-			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
+			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+			      | KVM_VCPUEVENT_VALID_SHADOW))
 		return -EINVAL;
 
 	vcpu_load(vcpu);
@@ -2143,6 +2148,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	vcpu->arch.interrupt.soft = events->interrupt.soft;
 	if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
 		kvm_pic_clear_isr_ack(vcpu->kvm);
+	if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
+		kvm_x86_ops->set_interrupt_shadow(vcpu,
+						  events->interrupt.shadow);
 
 	vcpu->arch.nmi_injected = events->nmi.injected;
 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 360f85e8c435..48516a2a0b84 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -502,6 +502,7 @@ struct kvm_ioeventfd {
 #define KVM_CAP_HYPERV_SPIN 46
 #define KVM_CAP_PCI_SEGMENT 47
 #define KVM_CAP_PPC_PAIRED_SINGLES 48
+#define KVM_CAP_INTR_SHADOW 49
 #define KVM_CAP_X86_ROBUST_SINGLESTEP 51
 
 #ifdef KVM_CAP_IRQ_ROUTING
-- 
cgit v1.2.3-55-g7522


From a1efbe77c1fd7c34a97a76a61520bf23fb3663f6 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Mon, 15 Feb 2010 10:45:43 +0100
Subject: KVM: x86: Add support for saving&restoring debug registers

So far user space was not able to save and restore debug registers for
migration or after reset. Plug this hole.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kvm/api.txt  | 31 ++++++++++++++++++++++++++
 arch/x86/include/asm/kvm.h | 10 +++++++++
 arch/x86/kvm/x86.c         | 54 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/kvm.h        |  6 ++++++
 4 files changed, 101 insertions(+)

(limited to 'arch/x86')

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 9e5de5a1c4ef..d170cb435545 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -718,6 +718,37 @@ If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in
 the flags field to signal that interrupt.shadow contains a valid state and
 shall be written into the VCPU.
 
+4.32 KVM_GET_DEBUGREGS
+
+Capability: KVM_CAP_DEBUGREGS
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_debugregs (out)
+Returns: 0 on success, -1 on error
+
+Reads debug registers from the vcpu.
+
+struct kvm_debugregs {
+	__u64 db[4];
+	__u64 dr6;
+	__u64 dr7;
+	__u64 flags;
+	__u64 reserved[9];
+};
+
+4.33 KVM_SET_DEBUGREGS
+
+Capability: KVM_CAP_DEBUGREGS
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_debugregs (in)
+Returns: 0 on success, -1 on error
+
+Writes debug registers into the vcpu.
+
+See KVM_GET_DEBUGREGS for the data structure. The flags field is unused
+yet and must be cleared on entry.
+
 
 5. The kvm_run structure
 
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index fb6117063ea3..ff90055c7f0b 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -21,6 +21,7 @@
 #define __KVM_HAVE_PIT_STATE2
 #define __KVM_HAVE_XEN_HVM
 #define __KVM_HAVE_VCPU_EVENTS
+#define __KVM_HAVE_DEBUGREGS
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -289,4 +290,13 @@ struct kvm_vcpu_events {
 	__u32 reserved[10];
 };
 
+/* for KVM_GET/SET_DEBUGREGS */
+struct kvm_debugregs {
+	__u64 db[4];
+	__u64 dr6;
+	__u64 dr7;
+	__u64 flags;
+	__u64 reserved[9];
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84ffd95ee198..efeeabd84ecd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1548,6 +1548,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_HYPERV_VAPIC:
 	case KVM_CAP_HYPERV_SPIN:
 	case KVM_CAP_PCI_SEGMENT:
+	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 		r = 1;
 		break;
@@ -2165,6 +2166,36 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+					     struct kvm_debugregs *dbgregs)
+{
+	vcpu_load(vcpu);
+
+	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
+	dbgregs->dr6 = vcpu->arch.dr6;
+	dbgregs->dr7 = vcpu->arch.dr7;
+	dbgregs->flags = 0;
+
+	vcpu_put(vcpu);
+}
+
+static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
+					    struct kvm_debugregs *dbgregs)
+{
+	if (dbgregs->flags)
+		return -EINVAL;
+
+	vcpu_load(vcpu);
+
+	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+	vcpu->arch.dr6 = dbgregs->dr6;
+	vcpu->arch.dr7 = dbgregs->dr7;
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -2343,6 +2374,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
 		break;
 	}
+	case KVM_GET_DEBUGREGS: {
+		struct kvm_debugregs dbgregs;
+
+		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+
+		r = -EFAULT;
+		if (copy_to_user(argp, &dbgregs,
+				 sizeof(struct kvm_debugregs)))
+			break;
+		r = 0;
+		break;
+	}
+	case KVM_SET_DEBUGREGS: {
+		struct kvm_debugregs dbgregs;
+
+		r = -EFAULT;
+		if (copy_from_user(&dbgregs, argp,
+				   sizeof(struct kvm_debugregs)))
+			break;
+
+		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 48516a2a0b84..ce2876717a8b 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -503,6 +503,9 @@ struct kvm_ioeventfd {
 #define KVM_CAP_PCI_SEGMENT 47
 #define KVM_CAP_PPC_PAIRED_SINGLES 48
 #define KVM_CAP_INTR_SHADOW 49
+#ifdef __KVM_HAVE_DEBUGREGS
+#define KVM_CAP_DEBUGREGS 50
+#endif
 #define KVM_CAP_X86_ROBUST_SINGLESTEP 51
 
 #ifdef KVM_CAP_IRQ_ROUTING
@@ -690,6 +693,9 @@ struct kvm_clock_data {
 /* Available with KVM_CAP_VCPU_EVENTS */
 #define KVM_GET_VCPU_EVENTS       _IOR(KVMIO,  0x9f, struct kvm_vcpu_events)
 #define KVM_SET_VCPU_EVENTS       _IOW(KVMIO,  0xa0, struct kvm_vcpu_events)
+/* Available with KVM_CAP_DEBUGREGS */
+#define KVM_GET_DEBUGREGS         _IOR(KVMIO,  0xa1, struct kvm_debugregs)
+#define KVM_SET_DEBUGREGS         _IOW(KVMIO,  0xa2, struct kvm_debugregs)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
-- 
cgit v1.2.3-55-g7522


From 50a085bdd48af08cc7e3178ba0d7c1d5d8191698 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Wed, 24 Feb 2010 10:41:58 +0100
Subject: KVM: x86: Kick VCPU outside PIC lock again

This restores the deferred VCPU kicking before 956f97cf. We need this
over -rt as wake_up* requires non-atomic context in this configuration.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8259.c | 53 +++++++++++++++++++++++++++++++++++++---------------
 arch/x86/kvm/irq.h   |  1 +
 2 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a790fa128a9f..93825ff3338f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -33,6 +33,29 @@
 #include <linux/kvm_host.h>
 #include "trace.h"
 
+static void pic_lock(struct kvm_pic *s)
+	__acquires(&s->lock)
+{
+	raw_spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+	__releases(&s->lock)
+{
+	bool wakeup = s->wakeup_needed;
+	struct kvm_vcpu *vcpu;
+
+	s->wakeup_needed = false;
+
+	raw_spin_unlock(&s->lock);
+
+	if (wakeup) {
+		vcpu = s->kvm->bsp_vcpu;
+		if (vcpu)
+			kvm_vcpu_kick(vcpu);
+	}
+}
+
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 {
 	s->isr &= ~(1 << irq);
@@ -45,19 +68,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 	 * Other interrupt may be delivered to PIC while lock is dropped but
 	 * it should be safe since PIC state is already updated at this stage.
 	 */
-	raw_spin_unlock(&s->pics_state->lock);
+	pic_unlock(s->pics_state);
 	kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
-	raw_spin_lock(&s->pics_state->lock);
+	pic_lock(s->pics_state);
 }
 
 void kvm_pic_clear_isr_ack(struct kvm *kvm)
 {
 	struct kvm_pic *s = pic_irqchip(kvm);
 
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	s->pics[0].isr_ack = 0xff;
 	s->pics[1].isr_ack = 0xff;
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 }
 
 /*
@@ -158,9 +181,9 @@ static void pic_update_irq(struct kvm_pic *s)
 
 void kvm_pic_update_irq(struct kvm_pic *s)
 {
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	pic_update_irq(s);
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 }
 
 int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -168,14 +191,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
 	struct kvm_pic *s = opaque;
 	int ret = -1;
 
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	if (irq >= 0 && irq < PIC_NUM_PINS) {
 		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
 		pic_update_irq(s);
 		trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
 				      s->pics[irq >> 3].imr, ret == 0);
 	}
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 
 	return ret;
 }
@@ -205,7 +228,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 	int irq, irq2, intno;
 	struct kvm_pic *s = pic_irqchip(kvm);
 
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	irq = pic_get_irq(&s->pics[0]);
 	if (irq >= 0) {
 		pic_intack(&s->pics[0], irq);
@@ -230,7 +253,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 		intno = s->pics[0].irq_base + irq;
 	}
 	pic_update_irq(s);
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 
 	return intno;
 }
@@ -444,7 +467,7 @@ static int picdev_write(struct kvm_io_device *this,
 			printk(KERN_ERR "PIC: non byte write\n");
 		return 0;
 	}
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	switch (addr) {
 	case 0x20:
 	case 0x21:
@@ -457,7 +480,7 @@ static int picdev_write(struct kvm_io_device *this,
 		elcr_ioport_write(&s->pics[addr & 1], addr, data);
 		break;
 	}
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 	return 0;
 }
 
@@ -474,7 +497,7 @@ static int picdev_read(struct kvm_io_device *this,
 			printk(KERN_ERR "PIC: non byte read\n");
 		return 0;
 	}
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	switch (addr) {
 	case 0x20:
 	case 0x21:
@@ -488,7 +511,7 @@ static int picdev_read(struct kvm_io_device *this,
 		break;
 	}
 	*(unsigned char *)val = data;
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 	return 0;
 }
 
@@ -505,7 +528,7 @@ static void pic_irq_request(void *opaque, int level)
 	s->output = level;
 	if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
 		s->pics[0].isr_ack &= ~(1 << irq);
-		kvm_vcpu_kick(vcpu);
+		s->wakeup_needed = true;
 	}
 }
 
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 34b15915754d..cd1f362f413d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,6 +63,7 @@ struct kvm_kpic_state {
 
 struct kvm_pic {
 	raw_spinlock_t lock;
+	bool wakeup_needed;
 	unsigned pending_acks;
 	struct kvm *kvm;
 	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
-- 
cgit v1.2.3-55-g7522


From 116a4752c82f767a59c27b2630a8474b936a776a Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Tue, 23 Feb 2010 17:47:54 +0100
Subject: KVM: SVM: Move svm_queue_exception

Move svm_queue_exception past skip_emulated_instruction to allow calling
it later on.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index bd8f52f0823f..908ec61895ce 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -236,23 +236,6 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	vcpu->arch.efer = efer;
 }
 
-static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
-				bool has_error_code, u32 error_code)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	/* If we are within a nested VM we'd better #VMEXIT and let the
-	   guest handle the exception */
-	if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
-		return;
-
-	svm->vmcb->control.event_inj = nr
-		| SVM_EVTINJ_VALID
-		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
-		| SVM_EVTINJ_TYPE_EXEPT;
-	svm->vmcb->control.event_inj_err = error_code;
-}
-
 static int is_external_interrupt(u32 info)
 {
 	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
@@ -298,6 +281,23 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	svm_set_interrupt_shadow(vcpu, 0);
 }
 
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+				bool has_error_code, u32 error_code)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	/* If we are within a nested VM we'd better #VMEXIT and let the
+	   guest handle the exception */
+	if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
+		return;
+
+	svm->vmcb->control.event_inj = nr
+		| SVM_EVTINJ_VALID
+		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+		| SVM_EVTINJ_TYPE_EXEPT;
+	svm->vmcb->control.event_inj_err = error_code;
+}
+
 static int has_svm(void)
 {
 	const char *msg;
-- 
cgit v1.2.3-55-g7522


From f92653eeb496fe8624ac4b0d628c916a06a3d25c Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Tue, 23 Feb 2010 17:47:55 +0100
Subject: KVM: x86: Add kvm_is_linear_rip

Based on Gleb's suggestion: Add a helper kvm_is_linear_rip that matches
a given linear RIP against the current one. Use this for guest
single-stepping, more users will follow.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  4 +++-
 arch/x86/kvm/x86.c              | 21 +++++++++++++--------
 2 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d46e791de129..502fff123e29 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -362,8 +362,8 @@ struct kvm_vcpu_arch {
 	u64 *mce_banks;
 
 	/* used for guest single stepping over the given code position */
-	u16 singlestep_cs;
 	unsigned long singlestep_rip;
+
 	/* fields used by HYPER-V emulation */
 	u64 hv_vapic;
 };
@@ -820,4 +820,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 void kvm_define_shared_msr(unsigned index, u32 msr);
 void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
 
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index efeeabd84ecd..f2f246ae4a4c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5376,11 +5376,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 		vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
 	}
 
-	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
-		vcpu->arch.singlestep_cs =
-			get_segment_selector(vcpu, VCPU_SREG_CS);
-		vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
-	}
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
+			get_segment_base(vcpu, VCPU_SREG_CS);
 
 	/*
 	 * Trigger an rflags update that will inject or remove the trace
@@ -5871,6 +5869,15 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 	return kvm_x86_ops->interrupt_allowed(vcpu);
 }
 
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+{
+	unsigned long current_rip = kvm_rip_read(vcpu) +
+		get_segment_base(vcpu, VCPU_SREG_CS);
+
+	return current_rip == linear_rip;
+}
+EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
+
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 {
 	unsigned long rflags;
@@ -5885,9 +5892,7 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
-	    vcpu->arch.singlestep_cs ==
-			get_segment_selector(vcpu, VCPU_SREG_CS) &&
-	    vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
+	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
 		rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
 	kvm_x86_ops->set_rflags(vcpu, rflags);
 }
-- 
cgit v1.2.3-55-g7522


From 66b7138f913635b40822890ba8913548f8b285b8 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Tue, 23 Feb 2010 17:47:56 +0100
Subject: KVM: SVM: Emulate nRIP feature when reinjecting INT3

When in guest debugging mode, we have to reinject those #BP software
exceptions that are caused by guest-injected INT3. As older AMD
processors do not support the required nRIP VMCB field, try to emulate
it by moving RIP past the instruction on exception injection. Fix it up
again in case the injection failed and we were able to catch this. This
does not work for unintercepted faults, but it is better than doing
nothing.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 908ec61895ce..468ff6e721ce 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -47,6 +47,7 @@ MODULE_LICENSE("GPL");
 #define SVM_FEATURE_NPT  (1 << 0)
 #define SVM_FEATURE_LBRV (1 << 1)
 #define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_NRIP (1 << 3)
 #define SVM_FEATURE_PAUSE_FILTER (1 << 10)
 
 #define NESTED_EXIT_HOST	0	/* Exit handled on host level */
@@ -110,6 +111,9 @@ struct vcpu_svm {
 	struct nested_state nested;
 
 	bool nmi_singlestep;
+
+	unsigned int3_injected;
+	unsigned long int3_rip;
 };
 
 /* enable NPT for AMD64 and X86 with PAE */
@@ -291,6 +295,22 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
 		return;
 
+	if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
+		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+
+		/*
+		 * For guest debugging where we have to reinject #BP if some
+		 * INT3 is guest-owned:
+		 * Emulate nRIP by moving RIP forward. Will fail if injection
+		 * raises a fault that is not intercepted. Still better than
+		 * failing in all cases.
+		 */
+		skip_emulated_instruction(&svm->vcpu);
+		rip = kvm_rip_read(&svm->vcpu);
+		svm->int3_rip = rip + svm->vmcb->save.cs.base;
+		svm->int3_injected = rip - old_rip;
+	}
+
 	svm->vmcb->control.event_inj = nr
 		| SVM_EVTINJ_VALID
 		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
@@ -2701,6 +2721,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 	u8 vector;
 	int type;
 	u32 exitintinfo = svm->vmcb->control.exit_int_info;
+	unsigned int3_injected = svm->int3_injected;
+
+	svm->int3_injected = 0;
 
 	if (svm->vcpu.arch.hflags & HF_IRET_MASK)
 		svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
@@ -2720,12 +2743,21 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 		svm->vcpu.arch.nmi_injected = true;
 		break;
 	case SVM_EXITINTINFO_TYPE_EXEPT:
-		/* In case of software exception do not reinject an exception
-		   vector, but re-execute and instruction instead */
 		if (is_nested(svm))
 			break;
-		if (kvm_exception_is_soft(vector))
+		/*
+		 * In case of software exceptions, do not reinject the vector,
+		 * but re-execute the instruction instead. Rewind RIP first
+		 * if we emulated INT3 before.
+		 */
+		if (kvm_exception_is_soft(vector)) {
+			if (vector == BP_VECTOR && int3_injected &&
+			    kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
+				kvm_rip_write(&svm->vcpu,
+					      kvm_rip_read(&svm->vcpu) -
+					      int3_injected);
 			break;
+		}
 		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
 			u32 err = svm->vmcb->control.exit_int_info_err;
 			kvm_queue_exception_e(&svm->vcpu, vector, err);
-- 
cgit v1.2.3-55-g7522


From c310bac5a20fc37f761bd7297ba2e52cf40d79c6 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Tue, 23 Feb 2010 17:47:58 +0100
Subject: KVM: x86: Drop RF manipulation for guest single-stepping

RF is not required for injecting TF as the latter will trigger only
after an instruction execution anyway. So do not touch RF when arming or
disarming guest single-step mode.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f2f246ae4a4c..a519fc6ed051 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5884,7 +5884,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 
 	rflags = kvm_x86_ops->get_rflags(vcpu);
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
+		rflags &= ~X86_EFLAGS_TF;
 	return rflags;
 }
 EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -5893,7 +5893,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
 	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
-		rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+		rflags |= X86_EFLAGS_TF;
 	kvm_x86_ops->set_rflags(vcpu, rflags);
 }
 EXPORT_SYMBOL_GPL(kvm_set_rflags);
-- 
cgit v1.2.3-55-g7522


From 83bf0002c91b65744db78df36d4f1af27bd9099b Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Tue, 23 Feb 2010 17:47:59 +0100
Subject: KVM: x86: Preserve injected TF across emulation

Call directly into the vendor services for getting/setting rflags in
emulate_instruction to ensure injected TF survives the emulation.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a519fc6ed051..3a367f35cebf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3447,7 +3447,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
 		vcpu->arch.emulate_ctxt.vcpu = vcpu;
-		vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
+		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
 		vcpu->arch.emulate_ctxt.mode =
 			(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
 			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
@@ -3526,7 +3526,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		return EMULATE_DO_MMIO;
 	}
 
-	kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 
 	if (vcpu->mmio_is_write) {
 		vcpu->mmio_needed = 0;
-- 
cgit v1.2.3-55-g7522


From e02317153e77150fed9609c3212c98204ec3ea74 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 24 Feb 2010 18:59:10 +0100
Subject: KVM: SVM: Coding style cleanup

This patch removes whitespace errors, fixes comment formats
and most of checkpatch warnings. Now vim does not show
c-space-errors anymore.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 148 +++++++++++++++++++++++++++++------------------------
 1 file changed, 81 insertions(+), 67 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 468ff6e721ce..44679530ad5d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -120,7 +120,7 @@ struct vcpu_svm {
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 static bool npt_enabled = true;
 #else
-static bool npt_enabled = false;
+static bool npt_enabled;
 #endif
 static int npt = 1;
 
@@ -168,8 +168,8 @@ static unsigned long iopm_base;
 struct kvm_ldttss_desc {
 	u16 limit0;
 	u16 base0;
-	unsigned base1 : 8, type : 5, dpl : 2, p : 1;
-	unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+	unsigned base1:8, type:5, dpl:2, p:1;
+	unsigned limit1:4, zero0:3, g:1, base2:8;
 	u32 base3;
 	u32 zero1;
 } __attribute__((packed));
@@ -218,7 +218,7 @@ static inline void stgi(void)
 
 static inline void invlpga(unsigned long addr, u32 asid)
 {
-	asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
+	asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
 }
 
 static inline void force_new_asid(struct kvm_vcpu *vcpu)
@@ -290,8 +290,10 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	/* If we are within a nested VM we'd better #VMEXIT and let the
-	   guest handle the exception */
+	/*
+	 * If we are within a nested VM we'd better #VMEXIT and let the guest
+	 * handle the exception
+	 */
 	if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
 		return;
 
@@ -544,7 +546,7 @@ static void init_seg(struct vmcb_seg *seg)
 {
 	seg->selector = 0;
 	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
-		SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
 	seg->limit = 0xffff;
 	seg->base = 0;
 }
@@ -564,16 +566,16 @@ static void init_vmcb(struct vcpu_svm *svm)
 
 	svm->vcpu.fpu_active = 1;
 
-	control->intercept_cr_read = 	INTERCEPT_CR0_MASK |
+	control->intercept_cr_read =	INTERCEPT_CR0_MASK |
 					INTERCEPT_CR3_MASK |
 					INTERCEPT_CR4_MASK;
 
-	control->intercept_cr_write = 	INTERCEPT_CR0_MASK |
+	control->intercept_cr_write =	INTERCEPT_CR0_MASK |
 					INTERCEPT_CR3_MASK |
 					INTERCEPT_CR4_MASK |
 					INTERCEPT_CR8_MASK;
 
-	control->intercept_dr_read = 	INTERCEPT_DR0_MASK |
+	control->intercept_dr_read =	INTERCEPT_DR0_MASK |
 					INTERCEPT_DR1_MASK |
 					INTERCEPT_DR2_MASK |
 					INTERCEPT_DR3_MASK |
@@ -582,7 +584,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 					INTERCEPT_DR6_MASK |
 					INTERCEPT_DR7_MASK;
 
-	control->intercept_dr_write = 	INTERCEPT_DR0_MASK |
+	control->intercept_dr_write =	INTERCEPT_DR0_MASK |
 					INTERCEPT_DR1_MASK |
 					INTERCEPT_DR2_MASK |
 					INTERCEPT_DR3_MASK |
@@ -596,7 +598,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 					(1 << MC_VECTOR);
 
 
-	control->intercept = 	(1ULL << INTERCEPT_INTR) |
+	control->intercept =	(1ULL << INTERCEPT_INTR) |
 				(1ULL << INTERCEPT_NMI) |
 				(1ULL << INTERCEPT_SMI) |
 				(1ULL << INTERCEPT_SELECTIVE_CR0) |
@@ -657,7 +659,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 	save->rip = 0x0000fff0;
 	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
 
-	/* This is the guest-visible cr0 value.
+	/*
+	 * This is the guest-visible cr0 value.
 	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
 	 */
 	svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
@@ -903,7 +906,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
 	var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
 
-	/* AMD's VMCB does not have an explicit unusable field, so emulate it
+	/*
+	 * AMD's VMCB does not have an explicit unusable field, so emulate it
 	 * for cross vendor migration purposes by "not present"
 	 */
 	var->unusable = !var->present || (var->type == 0);
@@ -939,7 +943,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 			var->type |= 0x1;
 		break;
 	case VCPU_SREG_SS:
-		/* On AMD CPUs sometimes the DB bit in the segment
+		/*
+		 * On AMD CPUs sometimes the DB bit in the segment
 		 * descriptor is left as 1, although the whole segment has
 		 * been made unusable. Clear it here to pass an Intel VMX
 		 * entry check when cross vendor migrating.
@@ -1270,7 +1275,7 @@ static int db_interception(struct vcpu_svm *svm)
 	}
 
 	if (svm->vcpu.guest_debug &
-	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
+	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
 		kvm_run->exit_reason = KVM_EXIT_DEBUG;
 		kvm_run->debug.arch.pc =
 			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1315,7 +1320,7 @@ static void svm_fpu_activate(struct kvm_vcpu *vcpu)
 		excp    = h_excp | n_excp;
 	} else {
 		excp  = svm->vmcb->control.intercept_exceptions;
-	        excp &= ~(1 << NM_VECTOR);
+		excp &= ~(1 << NM_VECTOR);
 	}
 
 	svm->vmcb->control.intercept_exceptions = excp;
@@ -1554,13 +1559,13 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 	case SVM_EXIT_INTR:
 	case SVM_EXIT_NMI:
 		return NESTED_EXIT_HOST;
-		/* For now we are always handling NPFs when using them */
 	case SVM_EXIT_NPF:
+		/* For now we are always handling NPFs when using them */
 		if (npt_enabled)
 			return NESTED_EXIT_HOST;
 		break;
-	/* When we're shadowing, trap PFs */
 	case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+		/* When we're shadowing, trap PFs */
 		if (!npt_enabled)
 			return NESTED_EXIT_HOST;
 		break;
@@ -1795,7 +1800,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 	if (!nested_msrpm)
 		return false;
 
-	for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
+	for (i = 0; i < PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
 		svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
 
 	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
@@ -1829,8 +1834,10 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
 
-	/* Save the old vmcb, so we don't need to pick what we save, but
-	   can restore everything when a VMEXIT occurs */
+	/*
+	 * Save the old vmcb, so we don't need to pick what we save, but can
+	 * restore everything when a VMEXIT occurs
+	 */
 	hsave->save.es     = vmcb->save.es;
 	hsave->save.cs     = vmcb->save.cs;
 	hsave->save.ss     = vmcb->save.ss;
@@ -1878,6 +1885,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
+
 	/* In case we don't even reach vcpu_run, the fields are not updated */
 	svm->vmcb->save.rax = nested_vmcb->save.rax;
 	svm->vmcb->save.rsp = nested_vmcb->save.rsp;
@@ -1909,8 +1917,10 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
 	}
 
-	/* We don't want a nested guest to be more powerful than the guest,
-	   so all intercepts are ORed */
+	/*
+	 * We don't want a nested guest to be more powerful than the guest, so
+	 * all intercepts are ORed
+	 */
 	svm->vmcb->control.intercept_cr_read |=
 		nested_vmcb->control.intercept_cr_read;
 	svm->vmcb->control.intercept_cr_write |=
@@ -2224,9 +2234,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 	case MSR_IA32_SYSENTER_ESP:
 		*data = svm->sysenter_esp;
 		break;
-	/* Nobody will change the following 5 values in the VMCB so
-	   we can safely return them on rdmsr. They will always be 0
-	   until LBRV is implemented. */
+	/*
+	 * Nobody will change the following 5 values in the VMCB so we can
+	 * safely return them on rdmsr. They will always be 0 until LBRV is
+	 * implemented.
+	 */
 	case MSR_IA32_DEBUGCTLMSR:
 		*data = svm->vmcb->save.dbgctl;
 		break;
@@ -2405,16 +2417,16 @@ static int pause_interception(struct vcpu_svm *svm)
 }
 
 static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
-	[SVM_EXIT_READ_CR0]           		= emulate_on_interception,
-	[SVM_EXIT_READ_CR3]           		= emulate_on_interception,
-	[SVM_EXIT_READ_CR4]           		= emulate_on_interception,
-	[SVM_EXIT_READ_CR8]           		= emulate_on_interception,
+	[SVM_EXIT_READ_CR0]			= emulate_on_interception,
+	[SVM_EXIT_READ_CR3]			= emulate_on_interception,
+	[SVM_EXIT_READ_CR4]			= emulate_on_interception,
+	[SVM_EXIT_READ_CR8]			= emulate_on_interception,
 	[SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR0]          		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR3]          		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR4]          		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR8]          		= cr8_write_interception,
-	[SVM_EXIT_READ_DR0] 			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR0]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR3]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR4]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
+	[SVM_EXIT_READ_DR0]			= emulate_on_interception,
 	[SVM_EXIT_READ_DR1]			= emulate_on_interception,
 	[SVM_EXIT_READ_DR2]			= emulate_on_interception,
 	[SVM_EXIT_READ_DR3]			= emulate_on_interception,
@@ -2433,15 +2445,14 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,
 	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
 	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
-	[SVM_EXIT_EXCP_BASE + PF_VECTOR] 	= pf_interception,
-	[SVM_EXIT_EXCP_BASE + NM_VECTOR] 	= nm_interception,
-	[SVM_EXIT_EXCP_BASE + MC_VECTOR] 	= mc_interception,
-	[SVM_EXIT_INTR] 			= intr_interception,
+	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
+	[SVM_EXIT_EXCP_BASE + NM_VECTOR]	= nm_interception,
+	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
+	[SVM_EXIT_INTR]				= intr_interception,
 	[SVM_EXIT_NMI]				= nmi_interception,
 	[SVM_EXIT_SMI]				= nop_on_interception,
 	[SVM_EXIT_INIT]				= nop_on_interception,
 	[SVM_EXIT_VINTR]			= interrupt_window_interception,
-	/* [SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception, */
 	[SVM_EXIT_CPUID]			= cpuid_interception,
 	[SVM_EXIT_IRET]                         = iret_interception,
 	[SVM_EXIT_INVD]                         = emulate_on_interception,
@@ -2449,7 +2460,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_HLT]				= halt_interception,
 	[SVM_EXIT_INVLPG]			= invlpg_interception,
 	[SVM_EXIT_INVLPGA]			= invlpga_interception,
-	[SVM_EXIT_IOIO] 		  	= io_interception,
+	[SVM_EXIT_IOIO]				= io_interception,
 	[SVM_EXIT_MSR]				= msr_interception,
 	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
 	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
@@ -2650,10 +2661,12 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	/* In case GIF=0 we can't rely on the CPU to tell us when
-	 * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
-	 * The next time we get that intercept, this function will be
-	 * called again though and we'll get the vintr intercept. */
+	/*
+	 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
+	 * 1, because that's a separate STGI/VMRUN intercept.  The next time we
+	 * get that intercept, this function will be called again though and
+	 * we'll get the vintr intercept.
+	 */
 	if (gif_set(svm) && nested_svm_intr(svm)) {
 		svm_set_vintr(svm);
 		svm_inject_irq(svm, 0x0);
@@ -2668,9 +2681,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	    == HF_NMI_MASK)
 		return; /* IRET will cause a vm exit */
 
-	/* Something prevents NMI from been injected. Single step over
-	   possible problem (IRET or exception injection or interrupt
-	   shadow) */
+	/*
+	 * Something prevents NMI from been injected. Single step over possible
+	 * problem (IRET or exception injection or interrupt shadow)
+	 */
 	svm->nmi_singlestep = true;
 	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
 	update_db_intercept(vcpu);
@@ -2978,24 +2992,24 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 }
 
 static const struct trace_print_flags svm_exit_reasons_str[] = {
-	{ SVM_EXIT_READ_CR0,           		"read_cr0" },
-	{ SVM_EXIT_READ_CR3,	      		"read_cr3" },
-	{ SVM_EXIT_READ_CR4,	      		"read_cr4" },
-	{ SVM_EXIT_READ_CR8,  	      		"read_cr8" },
-	{ SVM_EXIT_WRITE_CR0,          		"write_cr0" },
-	{ SVM_EXIT_WRITE_CR3,	      		"write_cr3" },
-	{ SVM_EXIT_WRITE_CR4,          		"write_cr4" },
-	{ SVM_EXIT_WRITE_CR8, 	      		"write_cr8" },
-	{ SVM_EXIT_READ_DR0, 	      		"read_dr0" },
-	{ SVM_EXIT_READ_DR1,	      		"read_dr1" },
-	{ SVM_EXIT_READ_DR2,	      		"read_dr2" },
-	{ SVM_EXIT_READ_DR3,	      		"read_dr3" },
-	{ SVM_EXIT_WRITE_DR0,	      		"write_dr0" },
-	{ SVM_EXIT_WRITE_DR1,	      		"write_dr1" },
-	{ SVM_EXIT_WRITE_DR2,	      		"write_dr2" },
-	{ SVM_EXIT_WRITE_DR3,	      		"write_dr3" },
-	{ SVM_EXIT_WRITE_DR5,	      		"write_dr5" },
-	{ SVM_EXIT_WRITE_DR7,	      		"write_dr7" },
+	{ SVM_EXIT_READ_CR0,			"read_cr0" },
+	{ SVM_EXIT_READ_CR3,			"read_cr3" },
+	{ SVM_EXIT_READ_CR4,			"read_cr4" },
+	{ SVM_EXIT_READ_CR8,			"read_cr8" },
+	{ SVM_EXIT_WRITE_CR0,			"write_cr0" },
+	{ SVM_EXIT_WRITE_CR3,			"write_cr3" },
+	{ SVM_EXIT_WRITE_CR4,			"write_cr4" },
+	{ SVM_EXIT_WRITE_CR8,			"write_cr8" },
+	{ SVM_EXIT_READ_DR0,			"read_dr0" },
+	{ SVM_EXIT_READ_DR1,			"read_dr1" },
+	{ SVM_EXIT_READ_DR2,			"read_dr2" },
+	{ SVM_EXIT_READ_DR3,			"read_dr3" },
+	{ SVM_EXIT_WRITE_DR0,			"write_dr0" },
+	{ SVM_EXIT_WRITE_DR1,			"write_dr1" },
+	{ SVM_EXIT_WRITE_DR2,			"write_dr2" },
+	{ SVM_EXIT_WRITE_DR3,			"write_dr3" },
+	{ SVM_EXIT_WRITE_DR5,			"write_dr5" },
+	{ SVM_EXIT_WRITE_DR7,			"write_dr7" },
 	{ SVM_EXIT_EXCP_BASE + DB_VECTOR,	"DB excp" },
 	{ SVM_EXIT_EXCP_BASE + BP_VECTOR,	"BP excp" },
 	{ SVM_EXIT_EXCP_BASE + UD_VECTOR,	"UD excp" },
-- 
cgit v1.2.3-55-g7522


From 0e5cbe368b366f02cf3fe707aea2c0efac1bf70e Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 24 Feb 2010 18:59:11 +0100
Subject: KVM: SVM: Reset MMU on nested_svm_vmrun for NPT too

Without resetting the MMU the gva_to_pga function will not
work reliably when the vcpu is running in nested context.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 44679530ad5d..ef40b90219f6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1877,10 +1877,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	if (npt_enabled) {
 		svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
 		svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
-	} else {
+	} else
 		kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
-		kvm_mmu_reset_context(&svm->vcpu);
-	}
+
+	/* Guest paging mode is active - reset mmu */
+	kvm_mmu_reset_context(&svm->vcpu);
+
 	svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
-- 
cgit v1.2.3-55-g7522


From 887f500ca1a721cac237ba56374b9c0f578251ec Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 24 Feb 2010 18:59:12 +0100
Subject: KVM: SVM: Check for nested intercepts on NMI injection

This patch implements the NMI intercept checking for nested
svm.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ef40b90219f6..9f1143105e00 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1486,6 +1486,21 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
 	return true;
 }
 
+/* This function returns true if it is save to enable the nmi window */
+static inline bool nested_svm_nmi(struct vcpu_svm *svm)
+{
+	if (!is_nested(svm))
+		return true;
+
+	if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
+		return true;
+
+	svm->vmcb->control.exit_code = SVM_EXIT_NMI;
+	svm->nested.exit_required = true;
+
+	return false;
+}
+
 static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
 {
 	struct page *page;
@@ -2687,9 +2702,11 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	 * Something prevents NMI from been injected. Single step over possible
 	 * problem (IRET or exception injection or interrupt shadow)
 	 */
-	svm->nmi_singlestep = true;
-	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
-	update_db_intercept(vcpu);
+	if (gif_set(svm) && nested_svm_nmi(svm)) {
+		svm->nmi_singlestep = true;
+		svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+		update_db_intercept(vcpu);
+	}
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-- 
cgit v1.2.3-55-g7522


From ecf1405df235da3efea213427ac7da7f816e9d06 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 24 Feb 2010 18:59:13 +0100
Subject: KVM: SVM: Restore tracing of nested vmcb address

A recent change broke tracing of the nested vmcb address. It
was reported as 0 all the time. This patch fixes it.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9f1143105e00..2e4e089646a7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1839,7 +1839,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	if (!nested_vmcb)
 		return false;
 
-	trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
+	trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
 			       nested_vmcb->save.rip,
 			       nested_vmcb->control.int_ctl,
 			       nested_vmcb->control.event_inj,
-- 
cgit v1.2.3-55-g7522


From 2e554e8d67926024b01e97d2fe652810165354e2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 24 Feb 2010 18:59:14 +0100
Subject: KVM: SVM: Add kvm_nested_intercepts tracepoint

This patch adds a tracepoint to get information about the
most important intercept bitmasks from the nested vmcb.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c   |  5 +++++
 arch/x86/kvm/trace.h | 22 ++++++++++++++++++++++
 arch/x86/kvm/x86.c   |  1 +
 3 files changed, 28 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2e4e089646a7..cac761c6d1dc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1845,6 +1845,11 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 			       nested_vmcb->control.event_inj,
 			       nested_vmcb->control.nested_ctl);
 
+	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
+				    nested_vmcb->control.intercept_cr_write,
+				    nested_vmcb->control.intercept_exceptions,
+				    nested_vmcb->control.intercept);
+
 	/* Clear internal status */
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 12f8d2dee984..17b52ccd9774 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -419,6 +419,28 @@ TRACE_EVENT(kvm_nested_vmrun,
 		__entry->npt ? "on" : "off")
 );
 
+TRACE_EVENT(kvm_nested_intercepts,
+	    TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
+	    TP_ARGS(cr_read, cr_write, exceptions, intercept),
+
+	TP_STRUCT__entry(
+		__field(	__u16,		cr_read		)
+		__field(	__u16,		cr_write	)
+		__field(	__u32,		exceptions	)
+		__field(	__u64,		intercept	)
+	),
+
+	TP_fast_assign(
+		__entry->cr_read	= cr_read;
+		__entry->cr_write	= cr_write;
+		__entry->exceptions	= exceptions;
+		__entry->intercept	= intercept;
+	),
+
+	TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
+		__entry->cr_read, __entry->cr_write, __entry->exceptions,
+		__entry->intercept)
+);
 /*
  * Tracepoint for #VMEXIT while nested
  */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3a367f35cebf..1aa4d6e26bad 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5909,3 +5909,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
-- 
cgit v1.2.3-55-g7522


From 4a810181c8bcd73112f5c62b205b5583fd4a197f Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 24 Feb 2010 18:59:15 +0100
Subject: KVM: SVM: Implement emulation of vm_cr msr

This patch implements the emulation of the vm_cr msr for
nested svm.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/svm.h |  4 ++++
 arch/x86/kvm/svm.c         | 29 ++++++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 38638cd2fa4c..b26a38d85356 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -115,6 +115,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
 #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
 
+#define SVM_VM_CR_VALID_MASK	0x001fULL
+#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
+#define SVM_VM_CR_SVM_DIS_MASK  0x0010ULL
+
 struct __attribute__ ((__packed__)) vmcb_seg {
 	u16 selector;
 	u16 attrib;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cac761c6d1dc..b4aac5c7ad87 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -71,6 +71,7 @@ struct kvm_vcpu;
 struct nested_state {
 	struct vmcb *hsave;
 	u64 hsave_msr;
+	u64 vm_cr_msr;
 	u64 vmcb;
 
 	/* These are the merged vectors */
@@ -2280,7 +2281,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 		*data = svm->nested.hsave_msr;
 		break;
 	case MSR_VM_CR:
-		*data = 0;
+		*data = svm->nested.vm_cr_msr;
 		break;
 	case MSR_IA32_UCODE_REV:
 		*data = 0x01000065;
@@ -2310,6 +2311,31 @@ static int rdmsr_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	int svm_dis, chg_mask;
+
+	if (data & ~SVM_VM_CR_VALID_MASK)
+		return 1;
+
+	chg_mask = SVM_VM_CR_VALID_MASK;
+
+	if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
+		chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
+
+	svm->nested.vm_cr_msr &= ~chg_mask;
+	svm->nested.vm_cr_msr |= (data & chg_mask);
+
+	svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
+
+	/* check for svm_disable while efer.svme is set */
+	if (svm_dis && (vcpu->arch.efer & EFER_SVME))
+		return 1;
+
+	return 0;
+}
+
 static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -2376,6 +2402,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 		svm->nested.hsave_msr = data;
 		break;
 	case MSR_VM_CR:
+		return svm_set_vm_cr(vcpu, data);
 	case MSR_VM_IGNNE:
 		pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
 		break;
-- 
cgit v1.2.3-55-g7522


From 82494028dff648c29e3a40915f1fceca51b4490a Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 24 Feb 2010 18:59:16 +0100
Subject: KVM: SVM: Ignore write of hwcr.ignne

Hyper-V as a guest wants to write this bit. This patch
ignores it.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1aa4d6e26bad..81b7af5558f9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1090,6 +1090,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		break;
 	case MSR_K7_HWCR:
 		data &= ~(u64)0x40;	/* ignore flush filter disable */
+		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
 		if (data != 0) {
 			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
 				data);
-- 
cgit v1.2.3-55-g7522


From b44ea385d8cb187e04ec8d901d4c320c8b07c40b Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 24 Feb 2010 18:59:17 +0100
Subject: KVM: x86: Don't set arch.cr0 in kvm_set_cr0

The vcpu->arch.cr0 variable is already set in the
architecture specific set_cr0 callbacks. There is no need to
set it in the common code.
This allows the architecture code to keep the old arch.cr0
value if it wants. This is required for nested svm to decide
if a selective_cr0 exit needs to be injected.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 81b7af5558f9..d0b184b5c248 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -475,7 +475,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	}
 
 	kvm_x86_ops->set_cr0(vcpu, cr0);
-	vcpu->arch.cr0 = cr0;
 
 	kvm_mmu_reset_context(vcpu);
 	return;
-- 
cgit v1.2.3-55-g7522


From 7f5d8b5600b5294137886b46bf00ef811d0fdf32 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 24 Feb 2010 18:59:18 +0100
Subject: KVM: SVM: Handle nested selective_cr0 intercept correctly

If we have the following situation with nested svm:

1. Host KVM intercepts cr0 writes
2. Guest hypervisor intercepts only selective cr0 writes

Then we get an cr0 write intercept which is handled on the
host. But that intercepts may actually be a selective cr0
intercept for the guest. This patch checks for this
condition and injects a selective cr0 intercept if needed.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b4aac5c7ad87..631d2e544491 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1043,6 +1043,27 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (is_nested(svm)) {
+		/*
+		 * We are here because we run in nested mode, the host kvm
+		 * intercepts cr0 writes but the l1 hypervisor does not.
+		 * But the L1 hypervisor may intercept selective cr0 writes.
+		 * This needs to be checked here.
+		 */
+		unsigned long old, new;
+
+		/* Remove bits that would trigger a real cr0 write intercept */
+		old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
+		new = cr0 & SVM_CR0_SELECTIVE_MASK;
+
+		if (old == new) {
+			/* cr0 write with ts and mp unchanged */
+			svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+			if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
+				return;
+		}
+	}
+
 #ifdef CONFIG_X86_64
 	if (vcpu->arch.efer & EFER_LME) {
 		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
-- 
cgit v1.2.3-55-g7522


From 197717d5813fc39a7185a3177b76f4a3b2405df7 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 24 Feb 2010 18:59:19 +0100
Subject: KVM: SVM: Clear exit_info for injected INTR exits

When injecting an vmexit.intr into the nested hypervisor
there might be leftover values in the exit_info fields.
Clear them to not confuse nested hypervisors.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 631d2e544491..38f1fceefea1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1491,7 +1491,9 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
 	if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
 		return false;
 
-	svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+	svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
+	svm->vmcb->control.exit_info_1 = 0;
+	svm->vmcb->control.exit_info_2 = 0;
 
 	if (svm->nested.intercept & 1ULL) {
 		/*
-- 
cgit v1.2.3-55-g7522


From d6ab1ed44627c91d0a857a430b7ec4ed8648c7a5 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 25 Feb 2010 12:43:07 +0200
Subject: KVM: Drop kvm_get_gdt() in favor of generic linux function

Linux now has native_store_gdt() to do the same. Use it instead of
kvm local version.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 5 -----
 arch/x86/kvm/svm.c              | 2 +-
 arch/x86/kvm/vmx.c              | 4 ++--
 arch/x86/kvm/x86.c              | 2 +-
 4 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 502fff123e29..e3167224d936 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -723,11 +723,6 @@ static inline void kvm_get_idt(struct desc_ptr *table)
 	asm("sidt %0" : "=m"(*table));
 }
 
-static inline void kvm_get_gdt(struct desc_ptr *table)
-{
-	asm("sgdt %0" : "=m"(*table));
-}
-
 static inline unsigned long kvm_read_tr_base(void)
 {
 	u16 tr;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 38f1fceefea1..6882be5b9267 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -368,7 +368,7 @@ static int svm_hardware_enable(void *garbage)
 	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 	sd->next_asid = sd->max_asid + 1;
 
-	kvm_get_gdt(&gdt_descr);
+	native_store_gdt(&gdt_descr);
 	gdt = (struct desc_struct *)gdt_descr.address;
 	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 61f03980adae..68712bdf0407 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -603,7 +603,7 @@ static void reload_tss(void)
 	struct desc_ptr gdt;
 	struct desc_struct *descs;
 
-	kvm_get_gdt(&gdt);
+	native_store_gdt(&gdt);
 	descs = (void *)gdt.address;
 	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
 	load_TR_desc();
@@ -767,7 +767,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		 * processors.
 		 */
 		vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
-		kvm_get_gdt(&dt);
+		native_store_gdt(&dt);
 		vmcs_writel(HOST_GDTR_BASE, dt.address);   /* 22.2.4 */
 
 		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d0b184b5c248..e07b243055f8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -233,7 +233,7 @@ unsigned long segment_base(u16 selector)
 	if (selector == 0)
 		return 0;
 
-	kvm_get_gdt(&gdt);
+	native_store_gdt(&gdt);
 	table_base = gdt.address;
 
 	if (selector & 4) {           /* from ldt */
-- 
cgit v1.2.3-55-g7522


From 254d4d48a56925622a5592ad590a738735b66135 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 25 Feb 2010 12:43:08 +0200
Subject: KVM: fix segment_base() error checking

fix segment_base() to properly check for null segment selector and
avoid accessing NULL pointer if ldt selector in null.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e07b243055f8..814e72a02eff 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -230,7 +230,7 @@ unsigned long segment_base(u16 selector)
 	unsigned long table_base;
 	unsigned long v;
 
-	if (selector == 0)
+	if (!(selector & ~3))
 		return 0;
 
 	native_store_gdt(&gdt);
@@ -239,6 +239,8 @@ unsigned long segment_base(u16 selector)
 	if (selector & 4) {           /* from ldt */
 		u16 ldt_selector = kvm_read_ldt();
 
+		if (!(ldt_selector & ~3))
+			return 0;
 		table_base = segment_base(ldt_selector);
 	}
 	d = (struct desc_struct *)(table_base + (selector & ~7));
-- 
cgit v1.2.3-55-g7522


From 2d49ec72d3fab0aa90510a64a973d594c48b1fd1 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 25 Feb 2010 12:43:09 +0200
Subject: KVM: move segment_base() into vmx.c

segment_base() is used only by vmx so move it there.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  9 ---------
 arch/x86/kvm/vmx.c              | 37 +++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              | 30 ------------------------------
 3 files changed, 37 insertions(+), 39 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e3167224d936..ec891a2ce86e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -644,8 +644,6 @@ int emulator_write_emulated(unsigned long addr,
 			    unsigned int bytes,
 			    struct kvm_vcpu *vcpu);
 
-unsigned long segment_base(u16 selector);
-
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes,
@@ -723,13 +721,6 @@ static inline void kvm_get_idt(struct desc_ptr *table)
 	asm("sidt %0" : "=m"(*table));
 }
 
-static inline unsigned long kvm_read_tr_base(void)
-{
-	u16 tr;
-	asm("str %0" : "=g"(tr));
-	return segment_base(tr);
-}
-
 #ifdef CONFIG_X86_64
 static inline unsigned long read_msr(unsigned long msr)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 68712bdf0407..8e2a24693be9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -634,6 +634,43 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 	return true;
 }
 
+static unsigned long segment_base(u16 selector)
+{
+	struct desc_ptr gdt;
+	struct desc_struct *d;
+	unsigned long table_base;
+	unsigned long v;
+
+	if (!(selector & ~3))
+		return 0;
+
+	native_store_gdt(&gdt);
+	table_base = gdt.address;
+
+	if (selector & 4) {           /* from ldt */
+		u16 ldt_selector = kvm_read_ldt();
+
+		if (!(ldt_selector & ~3))
+			return 0;
+
+		table_base = segment_base(ldt_selector);
+	}
+	d = (struct desc_struct *)(table_base + (selector & ~7));
+	v = get_desc_base(d);
+#ifdef CONFIG_X86_64
+       if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+               v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
+#endif
+	return v;
+}
+
+static inline unsigned long kvm_read_tr_base(void)
+{
+	u16 tr;
+	asm("str %0" : "=g"(tr));
+	return segment_base(tr);
+}
+
 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 814e72a02eff..2b34dc705cfb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -223,36 +223,6 @@ static void drop_user_return_notifiers(void *ignore)
 		kvm_on_user_return(&smsr->urn);
 }
 
-unsigned long segment_base(u16 selector)
-{
-	struct desc_ptr gdt;
-	struct desc_struct *d;
-	unsigned long table_base;
-	unsigned long v;
-
-	if (!(selector & ~3))
-		return 0;
-
-	native_store_gdt(&gdt);
-	table_base = gdt.address;
-
-	if (selector & 4) {           /* from ldt */
-		u16 ldt_selector = kvm_read_ldt();
-
-		if (!(ldt_selector & ~3))
-			return 0;
-		table_base = segment_base(ldt_selector);
-	}
-	d = (struct desc_struct *)(table_base + (selector & ~7));
-	v = get_desc_base(d);
-#ifdef CONFIG_X86_64
-	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
-		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
-#endif
-	return v;
-}
-EXPORT_SYMBOL_GPL(segment_base);
-
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 {
 	if (irqchip_in_kernel(vcpu->kvm))
-- 
cgit v1.2.3-55-g7522


From e35b7b9c9e7d8768ee34e5904fed4cb0f2c2cb5d Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 25 Feb 2010 16:36:42 +0200
Subject: KVM: x86 emulator: Add decoding of 16bit second in memory argument

Add decoding of Ep type of argument used by callf/jmpf.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c9f604b0819c..97a740368b30 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -85,6 +85,9 @@
 #define Src2ImmByte (2<<29)
 #define Src2One     (3<<29)
 #define Src2Imm16   (4<<29)
+#define Src2Mem16   (5<<29) /* Used for Ep encoding. First argument has to be
+			       in memory and second argument is located
+			       immediately after the first one in memory. */
 #define Src2Mask    (7<<29)
 
 enum {
@@ -1163,6 +1166,10 @@ done_prefixes:
 		c->src2.bytes = 1;
 		c->src2.val = 1;
 		break;
+	case Src2Mem16:
+		c->src2.bytes = 2;
+		c->src2.type = OP_MEM;
+		break;
 	}
 
 	/* Decode and fetch the destination operand: register or memory. */
@@ -1881,6 +1888,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		c->src.orig_val = c->src.val;
 	}
 
+	if (c->src2.type == OP_MEM) {
+		c->src2.ptr = (unsigned long *)(memop + c->src.bytes);
+		c->src2.val = 0;
+		rc = ops->read_emulated((unsigned long)c->src2.ptr,
+					&c->src2.val,
+					c->src2.bytes,
+					ctxt->vcpu);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+	}
+
 	if ((c->d & DstMask) == ImplicitOps)
 		goto special_insn;
 
-- 
cgit v1.2.3-55-g7522


From ea79849d4c8461034b75acb19c8041b6fddee2a5 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 25 Feb 2010 16:36:43 +0200
Subject: KVM: x86 emulator: Implement jmp far opcode ff/5

Implement jmp far opcode ff/5. It is used by multiboot loader.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 97a740368b30..5b6794adaa2e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -346,7 +346,8 @@ static u32 group_table[] = {
 	[Group5*8] =
 	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
 	SrcMem | ModRM | Stack, 0,
-	SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
+	SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
+	SrcMem | ModRM | Stack, 0,
 	[Group7*8] =
 	0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
 	SrcNone | ModRM | DstMem | Mov, 0,
@@ -2322,6 +2323,7 @@ special_insn:
 	case 0xe9: /* jmp rel */
 		goto jmp;
 	case 0xea: /* jmp far */
+	jump_far:
 		if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
 						VCPU_SREG_CS))
 			goto done;
@@ -2397,11 +2399,16 @@ special_insn:
 		ctxt->eflags |= EFLG_DF;
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
-	case 0xfe ... 0xff:	/* Grp4/Grp5 */
+	case 0xfe: /* Grp4 */
+	grp45:
 		rc = emulate_grp45(ctxt, ops);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
+	case 0xff: /* Grp5 */
+		if (c->modrm_reg == 5)
+			goto jump_far;
+		goto grp45;
 	}
 
 writeback:
-- 
cgit v1.2.3-55-g7522


From 402af0d7c692ddcfa2333e93d3f275ebd0487926 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Wed, 21 Apr 2010 15:21:51 +0100
Subject: x86, asm: Introduce and use percpu_inc()

... generating slightly smaller code.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF261F020000780003B33C@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/hardirq.h   |  2 +-
 arch/x86/include/asm/percpu.h    | 24 ++++++++++++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce.c |  4 ++--
 3 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 0f8576427cfe..aeab29aee617 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -35,7 +35,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
 
 #define __ARCH_IRQ_STAT
 
-#define inc_irq_stat(member)	percpu_add(irq_stat.member, 1)
+#define inc_irq_stat(member)	percpu_inc(irq_stat.member)
 
 #define local_softirq_pending()	percpu_read(irq_stat.__softirq_pending)
 
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 66a272dfd8b8..0ec6d12d84e6 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -190,6 +190,29 @@ do {									\
 	pfo_ret__;					\
 })
 
+#define percpu_unary_op(op, var)			\
+({							\
+	switch (sizeof(var)) {				\
+	case 1:						\
+		asm(op "b "__percpu_arg(0)		\
+		    : "+m" (var));			\
+		break;					\
+	case 2:						\
+		asm(op "w "__percpu_arg(0)		\
+		    : "+m" (var));			\
+		break;					\
+	case 4:						\
+		asm(op "l "__percpu_arg(0)		\
+		    : "+m" (var));			\
+		break;					\
+	case 8:						\
+		asm(op "q "__percpu_arg(0)		\
+		    : "+m" (var));			\
+		break;					\
+	default: __bad_percpu_size();			\
+	}						\
+})
+
 /*
  * percpu_read() makes gcc load the percpu variable every time it is
  * accessed while percpu_read_stable() allows the value to be cached.
@@ -207,6 +230,7 @@ do {									\
 #define percpu_and(var, val)		percpu_to_op("and", var, val)
 #define percpu_or(var, val)		percpu_to_op("or", var, val)
 #define percpu_xor(var, val)		percpu_to_op("xor", var, val)
+#define percpu_inc(var)		percpu_unary_op("inc", var)
 
 #define __this_cpu_read_1(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
 #define __this_cpu_read_2(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767e..7a355ddcc64b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -539,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 	struct mce m;
 	int i;
 
-	__get_cpu_var(mce_poll_count)++;
+	percpu_inc(mce_poll_count);
 
 	mce_setup(&m);
 
@@ -934,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	atomic_inc(&mce_entry);
 
-	__get_cpu_var(mce_exception_count)++;
+	percpu_inc(mce_exception_count);
 
 	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
-- 
cgit v1.2.3-55-g7522


From 2e61878698781d6a9a8bfbaa4ea9c5ddb5a178c3 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Wed, 21 Apr 2010 16:13:20 +0100
Subject: x86-64: Combine SRAT regions when possible

... i.e. when the hole between two regions isn't occupied by memory on
another node. This reduces the memory->node table size, thus reducing
cache footprint of lookups, which got increased significantly some
time ago, and things go back to how they were before that change on
the systems I looked at.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF3230020000780003B3CA@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/mm/srat_64.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 28c68762648f..3ebe6519bd87 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -363,6 +363,54 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 	for (i = 0; i < MAX_NUMNODES; i++)
 		cutoff_node(i, start, end);
 
+	/*
+	 * Join together blocks on the same node, holes between
+	 * which don't overlap with memory on other nodes.
+	 */
+	for (i = 0; i < num_node_memblks; ++i) {
+		int j, k;
+
+		for (j = i + 1; j < num_node_memblks; ++j) {
+			unsigned long start, end;
+
+			if (memblk_nodeid[i] != memblk_nodeid[j])
+				continue;
+			start = min(node_memblk_range[i].end,
+			            node_memblk_range[j].end);
+			end = max(node_memblk_range[i].start,
+			          node_memblk_range[j].start);
+			for (k = 0; k < num_node_memblks; ++k) {
+				if (memblk_nodeid[i] == memblk_nodeid[k])
+					continue;
+				if (start < node_memblk_range[k].end &&
+				    end > node_memblk_range[k].start)
+					break;
+			}
+			if (k < num_node_memblks)
+				continue;
+			start = min(node_memblk_range[i].start,
+			            node_memblk_range[j].start);
+			end = max(node_memblk_range[i].end,
+			          node_memblk_range[j].end);
+			printk(KERN_INFO "SRAT: Node %d "
+			       "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
+			       memblk_nodeid[i],
+			       node_memblk_range[i].start,
+			       node_memblk_range[i].end,
+			       node_memblk_range[j].start,
+			       node_memblk_range[j].end,
+			       start, end);
+			node_memblk_range[i].start = start;
+			node_memblk_range[i].end = end;
+			k = --num_node_memblks - j;
+			memmove(memblk_nodeid + j, memblk_nodeid + j+1,
+				k * sizeof(*memblk_nodeid));
+			memmove(node_memblk_range + j, node_memblk_range + j+1,
+				k * sizeof(*node_memblk_range));
+			--j;
+		}
+	}
+
 	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
 					   memblk_nodeid);
 	if (memnode_shift < 0) {
-- 
cgit v1.2.3-55-g7522


From 5967ed87ade85a421ef814296c3c7f182b08c225 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Wed, 21 Apr 2010 16:08:14 +0100
Subject: x86-64: Reduce SMP locks table size

Reduce the SMP locks table size by using relative pointers instead of
absolute ones, thus cutting the table size by half.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF30FE020000780003B3B6@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/alternative-asm.h |  4 +--
 arch/x86/include/asm/alternative.h     |  4 +--
 arch/x86/kernel/alternative.c          | 45 +++++++++++++++++++---------------
 3 files changed, 29 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index b97f786a48d5..a63a68be1cce 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -6,8 +6,8 @@
 	.macro LOCK_PREFIX
 1:	lock
 	.section .smp_locks,"a"
-	_ASM_ALIGN
-	_ASM_PTR 1b
+	.balign 4
+	.long 1b - .
 	.previous
 	.endm
 #else
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index b09ec55650b3..714bf2417284 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -30,8 +30,8 @@
 #ifdef CONFIG_SMP
 #define LOCK_PREFIX \
 		".section .smp_locks,\"a\"\n"	\
-		_ASM_ALIGN "\n"			\
-		_ASM_PTR "661f\n" /* address */	\
+		".balign 4\n"			\
+		".long 661f - .\n" /* offset */	\
 		".previous\n"			\
 		"661:\n\tlock; "
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1a160d5d44d0..936738427223 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 }
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
 static void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /* Replace instructions with better alternatives for this CPU type.
@@ -235,37 +235,39 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 
 #ifdef CONFIG_SMP
 
-static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+				  u8 *text, u8 *text_end)
 {
-	u8 **ptr;
+	const s32 *poff;
 
 	mutex_lock(&text_mutex);
-	for (ptr = start; ptr < end; ptr++) {
-		if (*ptr < text)
-			continue;
-		if (*ptr > text_end)
+	for (poff = start; poff < end; poff++) {
+		u8 *ptr = (u8 *)poff + *poff;
+
+		if (!*poff || ptr < text || ptr >= text_end)
 			continue;
 		/* turn DS segment override prefix into lock prefix */
-		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+		text_poke(ptr, ((unsigned char []){0xf0}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
 
-static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+				    u8 *text, u8 *text_end)
 {
-	u8 **ptr;
+	const s32 *poff;
 
 	if (noreplace_smp)
 		return;
 
 	mutex_lock(&text_mutex);
-	for (ptr = start; ptr < end; ptr++) {
-		if (*ptr < text)
-			continue;
-		if (*ptr > text_end)
+	for (poff = start; poff < end; poff++) {
+		u8 *ptr = (u8 *)poff + *poff;
+
+		if (!*poff || ptr < text || ptr >= text_end)
 			continue;
 		/* turn lock prefix into DS segment override prefix */
-		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+		text_poke(ptr, ((unsigned char []){0x3E}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
@@ -276,8 +278,8 @@ struct smp_alt_module {
 	char		*name;
 
 	/* ptrs to lock prefixes */
-	u8		**locks;
-	u8		**locks_end;
+	const s32	*locks;
+	const s32	*locks_end;
 
 	/* .text segment, needed to avoid patching init code ;) */
 	u8		*text;
@@ -398,16 +400,19 @@ void alternatives_smp_switch(int smp)
 int alternatives_text_reserved(void *start, void *end)
 {
 	struct smp_alt_module *mod;
-	u8 **ptr;
+	const s32 *poff;
 	u8 *text_start = start;
 	u8 *text_end = end;
 
 	list_for_each_entry(mod, &smp_alt_modules, next) {
 		if (mod->text > text_end || mod->text_end < text_start)
 			continue;
-		for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
-			if (text_start <= *ptr && text_end >= *ptr)
+		for (poff = mod->locks; poff < mod->locks_end; poff++) {
+			const u8 *ptr = (const u8 *)poff + *poff;
+
+			if (text_start <= ptr && text_end > ptr)
 				return 1;
+		}
 	}
 
 	return 0;
-- 
cgit v1.2.3-55-g7522


From 47f9fe26299ae022ac1e3fa12e7e73def62b7898 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Wed, 21 Apr 2010 16:19:57 +0100
Subject: x86-64: Don't export init_level4_pgt

It's not used by any module, and i386 (as well as some other arches)
also doesn't export its equivalent (swapper_pg_dir).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF33BD020000780003B3E4@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/x8664_ksyms_64.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 693920b22496..1b950d151e58 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__memcpy);
 
 EXPORT_SYMBOL(empty_zero_page);
-EXPORT_SYMBOL(init_level4_pgt);
 #ifndef CONFIG_PARAVIRT
 EXPORT_SYMBOL(native_load_gs_index);
 #endif
-- 
cgit v1.2.3-55-g7522


From 6fc108a08dcddf8f9113cc7102ddaacf7ed37a6b Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Wed, 21 Apr 2010 15:23:44 +0100
Subject: x86: Clean up arch/x86/Kconfig*

No functional change intended.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF2690020000780003B340@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig            | 49 ++++++++++++++-------------------------------
 arch/x86/Kconfig.debug      |  2 --
 arch/x86/include/asm/boot.h |  2 +-
 3 files changed, 16 insertions(+), 37 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9458685902bd..85e2252625bc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -197,20 +197,17 @@ config HAVE_INTEL_TXT
 
 # Use the generic interrupt handling code in kernel/irq/:
 config GENERIC_HARDIRQS
-	bool
-	default y
+	def_bool y
 
 config GENERIC_HARDIRQS_NO__DO_IRQ
        def_bool y
 
 config GENERIC_IRQ_PROBE
-	bool
-	default y
+	def_bool y
 
 config GENERIC_PENDING_IRQ
-	bool
+	def_bool y
 	depends on GENERIC_HARDIRQS && SMP
-	default y
 
 config USE_GENERIC_SMP_HELPERS
 	def_bool y
@@ -225,14 +222,12 @@ config X86_64_SMP
 	depends on X86_64 && SMP
 
 config X86_HT
-	bool
+	def_bool y
 	depends on SMP
-	default y
 
 config X86_TRAMPOLINE
-	bool
+	def_bool y
 	depends on SMP || (64BIT && ACPI_SLEEP)
-	default y
 
 config X86_32_LAZY_GS
 	def_bool y
@@ -447,7 +442,7 @@ config X86_NUMAQ
 	  firmware with - send email to <Martin.Bligh@us.ibm.com>.
 
 config X86_SUPPORTS_MEMORY_FAILURE
-	bool
+	def_bool y
 	# MCE code calls memory_failure():
 	depends on X86_MCE
 	# On 32-bit this adds too big of NODES_SHIFT and we run out of page flags:
@@ -455,7 +450,6 @@ config X86_SUPPORTS_MEMORY_FAILURE
 	# On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH:
 	depends on X86_64 || !SPARSEMEM
 	select ARCH_SUPPORTS_MEMORY_FAILURE
-	default y
 
 config X86_VISWS
 	bool "SGI 320/540 (Visual Workstation)"
@@ -570,7 +564,6 @@ config PARAVIRT_SPINLOCKS
 
 config PARAVIRT_CLOCK
 	bool
-	default n
 
 endif
 
@@ -749,7 +742,6 @@ config MAXSMP
 	bool "Configure Maximum number of SMP Processors and NUMA Nodes"
 	depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
 	select CPUMASK_OFFSTACK
-	default n
 	---help---
 	  Configure maximum number of CPUS and NUMA Nodes for this architecture.
 	  If unsure, say N.
@@ -829,7 +821,6 @@ config X86_VISWS_APIC
 
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
 	bool "Reroute for broken boot IRQs"
-	default n
 	depends on X86_IO_APIC
 	---help---
 	  This option enables a workaround that fixes a source of
@@ -876,9 +867,8 @@ config X86_MCE_AMD
 	   the DRAM Error Threshold.
 
 config X86_ANCIENT_MCE
-	def_bool n
+	bool "Support for old Pentium 5 / WinChip machine checks"
 	depends on X86_32 && X86_MCE
-	prompt "Support for old Pentium 5 / WinChip machine checks"
 	---help---
 	  Include support for machine check handling on old Pentium 5 or WinChip
 	  systems. These typically need to be enabled explicitely on the command
@@ -886,8 +876,7 @@ config X86_ANCIENT_MCE
 
 config X86_MCE_THRESHOLD
 	depends on X86_MCE_AMD || X86_MCE_INTEL
-	bool
-	default y
+	def_bool y
 
 config X86_MCE_INJECT
 	depends on X86_MCE
@@ -1026,8 +1015,8 @@ config X86_CPUID
 
 choice
 	prompt "High Memory Support"
-	default HIGHMEM4G if !X86_NUMAQ
 	default HIGHMEM64G if X86_NUMAQ
+	default HIGHMEM4G
 	depends on X86_32
 
 config NOHIGHMEM
@@ -1285,7 +1274,7 @@ source "mm/Kconfig"
 
 config HIGHPTE
 	bool "Allocate 3rd-level pagetables from highmem"
-	depends on X86_32 && (HIGHMEM4G || HIGHMEM64G)
+	depends on HIGHMEM
 	---help---
 	  The VM uses one page table entry for each page of physical memory.
 	  For systems with a lot of RAM, this can be wasteful of precious
@@ -1369,8 +1358,7 @@ config MATH_EMULATION
 	  kernel, it won't hurt.
 
 config MTRR
-	bool
-	default y
+	def_bool y
 	prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
 	---help---
 	  On Intel P6 family processors (Pentium Pro, Pentium II and later)
@@ -1436,8 +1424,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
 	  mtrr_spare_reg_nr=N on the kernel command line.
 
 config X86_PAT
-	bool
-	default y
+	def_bool y
 	prompt "x86 PAT support" if EMBEDDED
 	depends on MTRR
 	---help---
@@ -1605,8 +1592,7 @@ config X86_NEED_RELOCS
 	depends on X86_32 && RELOCATABLE
 
 config PHYSICAL_ALIGN
-	hex
-	prompt "Alignment value to which kernel should be aligned" if X86_32
+	hex "Alignment value to which kernel should be aligned" if X86_32
 	default "0x1000000"
 	range 0x2000 0x1000000
 	---help---
@@ -1653,7 +1639,6 @@ config COMPAT_VDSO
 
 config CMDLINE_BOOL
 	bool "Built-in kernel command line"
-	default n
 	---help---
 	  Allow for specifying boot arguments to the kernel at
 	  build time.  On some systems (e.g. embedded ones), it is
@@ -1687,7 +1672,6 @@ config CMDLINE
 
 config CMDLINE_OVERRIDE
 	bool "Built-in command line overrides boot loader arguments"
-	default n
 	depends on CMDLINE_BOOL
 	---help---
 	  Set this option to 'Y' to have the kernel ignore the boot loader
@@ -1723,8 +1707,7 @@ source "drivers/acpi/Kconfig"
 source "drivers/sfi/Kconfig"
 
 config X86_APM_BOOT
-	bool
-	default y
+	def_bool y
 	depends on APM || APM_MODULE
 
 menuconfig APM
@@ -1953,8 +1936,7 @@ config DMAR_DEFAULT_ON
 	  experimental.
 
 config DMAR_BROKEN_GFX_WA
-	def_bool n
-	prompt "Workaround broken graphics drivers (going away soon)"
+	bool "Workaround broken graphics drivers (going away soon)"
 	depends on DMAR && BROKEN
 	---help---
 	  Current Graphics drivers tend to use physical address
@@ -2052,7 +2034,6 @@ config SCx200HR_TIMER
 config OLPC
 	bool "One Laptop Per Child support"
 	select GPIOLIB
-	default n
 	---help---
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bc01e3ebfeb2..e7edd66d866e 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -45,7 +45,6 @@ config EARLY_PRINTK
 
 config EARLY_PRINTK_DBGP
 	bool "Early printk via EHCI debug port"
-	default n
 	depends on EARLY_PRINTK && PCI
 	---help---
 	  Write kernel log output directly into the EHCI debug port.
@@ -76,7 +75,6 @@ config DEBUG_PER_CPU_MAPS
 	bool "Debug access to per_cpu maps"
 	depends on DEBUG_KERNEL
 	depends on SMP
-	default n
 	---help---
 	  Say Y to verify that the per_cpu map being accessed has
 	  been setup.  Adds a fair amount of code to kernel memory
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 7a1065958ba9..3b62ab56c7a0 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -24,7 +24,7 @@
 #define MIN_KERNEL_ALIGN	(_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
 
 #if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
-	(CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
+	(CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN)
 #error "Invalid value for CONFIG_PHYSICAL_ALIGN"
 #endif
 
-- 
cgit v1.2.3-55-g7522


From 30a564be9d9554c168a654eddc2165869cc0d7bf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 13 Apr 2010 15:31:36 +0200
Subject: x86, hpet: Restrict read back to affected ATI chipsets

After programming the HPET, we do a readback as a workaround for
ATI/SBx00 chipsets as a synchronization.  Unfortunately this triggers
an erratum in newer ICH chipsets (ICH9+) where reading the comparator
immediately after the write returns the old value.  Furthermore, as
always, I/O reads are bad for performance.

Therefore, restrict the readback to the chipsets that need it, or, for
debugging purposes, when we are running with hpet=verbose.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Venkatesh Pallipadi <venki@google.com>
LKML-Reference: <20100225185348.GA9674@linux-os.sc.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/hpet.h |  1 +
 arch/x86/kernel/hpet.c      | 29 +++++++++++++++++------------
 arch/x86/kernel/quirks.c    |  5 +++++
 3 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1d5c08a1bdfd..004e6e25e913 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -68,6 +68,7 @@ extern unsigned long force_hpet_address;
 extern u8 hpet_blockid;
 extern int hpet_force_user;
 extern u8 hpet_msi_disable;
+extern u8 hpet_readback_cmp;
 extern int is_hpet_enabled(void);
 extern int hpet_enable(void);
 extern void hpet_disable(void);
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 23b4ecdffa9b..a198b7c87a12 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -36,6 +36,7 @@
 unsigned long				hpet_address;
 u8					hpet_blockid; /* OS timer block num */
 u8					hpet_msi_disable;
+u8					hpet_readback_cmp;
 
 #ifdef CONFIG_PCI_MSI
 static unsigned long			hpet_num_timers;
@@ -395,19 +396,23 @@ static int hpet_next_event(unsigned long delta,
 	 * at that point and we would wait for the next hpet interrupt
 	 * forever. We found out that reading the CMP register back
 	 * forces the transfer so we can rely on the comparison with
-	 * the counter register below. If the read back from the
-	 * compare register does not match the value we programmed
-	 * then we might have a real hardware problem. We can not do
-	 * much about it here, but at least alert the user/admin with
-	 * a prominent warning.
-	 * An erratum on some chipsets (ICH9,..), results in comparator read
-	 * immediately following a write returning old value. Workaround
-	 * for this is to read this value second time, when first
-	 * read returns old value.
+	 * the counter register below.
+	 *
+	 * That works fine on those ATI chipsets, but on newer Intel
+	 * chipsets (ICH9...) this triggers due to an erratum: Reading
+	 * the comparator immediately following a write is returning
+	 * the old value.
+	 *
+	 * We restrict the read back to the affected ATI chipsets (set
+	 * by quirks) and also run it with hpet=verbose for debugging
+	 * purposes.
 	 */
-	if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
-		WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
-		  KERN_WARNING "hpet: compare register read back failed.\n");
+	if (hpet_readback_cmp || hpet_verbose) {
+		u32 cmp = hpet_readl(HPET_Tn_CMP(timer));
+
+		if (cmp != cnt)
+			printk_once(KERN_WARNING
+			    "hpet: compare register read back failed.\n");
 	}
 
 	return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 12e9feaa2f7a..cd2c336c8d09 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -495,10 +495,15 @@ void force_hpet_resume(void)
 /*
  * HPET MSI on some boards (ATI SB700/SB800) has side effect on
  * floppy DMA. Disable HPET MSI on such platforms.
+ *
+ * Also force the read back of the CMP register in hpet_next_event()
+ * to work around the problem that the CMP register write seems to be
+ * delayed. See hpet_next_event() for details.
  */
 static void force_disable_hpet_msi(struct pci_dev *unused)
 {
 	hpet_msi_disable = 1;
+	hpet_readback_cmp = 1;
 }
 
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
-- 
cgit v1.2.3-55-g7522


From b701a47ba48b698976fb2fe05fb285b0edc1d26a Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Thu, 29 Apr 2010 16:03:57 -0700
Subject: x86: Fix LOCK_PREFIX_HERE for uniprocessor build

Checkin b3ac891b67bd4b1fc728d1c784cad1212dea433d:
x86: Add support for lock prefix in alternatives

... did not define LOCK_PREFIX_HERE in the case of a uniprocessor
build.  As a result, it would cause any of the usages of this macro to
fail on a uniprocessor build.  Fix this by defining LOCK_PREFIX_HERE
as a null string.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267005265-27958-2-git-send-email-luca@luca-barbieri.com>
---
 arch/x86/include/asm/alternative.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 55fee12cea6d..e29a6c9bba00 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -38,6 +38,7 @@
 #define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
 
 #else /* ! CONFIG_SMP */
+#define LOCK_PREFIX_HERE ""
 #define LOCK_PREFIX ""
 #endif
 
-- 
cgit v1.2.3-55-g7522


From 73266fc1df2f94cf72b3beba3eee3b88ed0b0664 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 22 Apr 2010 05:05:45 +0200
Subject: hw-breakpoints: Tag ptrace breakpoint as exclude_kernel

Tag ptrace breakpoints with the exclude_kernel attribute set. This
will make it easier to set generic policies on breakpoints, when it
comes to ensure nobody unpriviliged try to breakpoint on the kernel.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: K. Prasad <prasad@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/sh/kernel/ptrace_32.c    | 2 +-
 arch/x86/kernel/ptrace.c      | 2 +-
 include/linux/hw_breakpoint.h | 6 ++++++
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index 7759a9a93211..d4104ce9fe53 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -85,7 +85,7 @@ static int set_single_step(struct task_struct *tsk, unsigned long addr)
 
 	bp = thread->ptrace_bps[0];
 	if (!bp) {
-		hw_breakpoint_init(&attr);
+		ptrace_breakpoint_init(&attr);
 
 		attr.bp_addr = addr;
 		attr.bp_len = HW_BREAKPOINT_LEN_2;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 055be0afd330..70c4872cd8aa 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -688,7 +688,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 	struct perf_event_attr attr;
 
 	if (!t->ptrace_bps[nr]) {
-		hw_breakpoint_init(&attr);
+		ptrace_breakpoint_init(&attr);
 		/*
 		 * Put stub len and type to register (reserve) an inactive but
 		 * correct bp
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index c70d27af03f9..a0aa5a9cfb0e 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -34,6 +34,12 @@ static inline void hw_breakpoint_init(struct perf_event_attr *attr)
 	attr->sample_period = 1;
 }
 
+static inline void ptrace_breakpoint_init(struct perf_event_attr *attr)
+{
+	hw_breakpoint_init(attr);
+	attr->exclude_kernel = 1;
+}
+
 static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
 {
 	return bp->attr.bp_addr;
-- 
cgit v1.2.3-55-g7522


From b2812d031dea86926e9c10f7714af33ac2f6b43d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sun, 18 Apr 2010 18:11:53 +0200
Subject: hw-breakpoints: Change/Enforce some breakpoints policies

The current policies of breakpoints in x86 and SH are the following:

- task bound breakpoints can only break on userspace addresses
- cpu wide breakpoints can only break on kernel addresses

The former rule prevents ptrace breakpoints to be set to trigger on
kernel addresses, which is good. But as a side effect, we can't
breakpoint on kernel addresses for task bound breakpoints.

The latter rule simply makes no sense, there is no reason why we
can't set breakpoints on userspace while performing cpu bound
profiles.

We want the following new policies:

- task bound breakpoint can set userspace address breakpoints, with
no particular privilege required.
- task bound breakpoints can set kernelspace address breakpoints but
must be privileged to do that.
- cpu bound breakpoints can do what they want as they are privileged
already.

To implement these new policies, this patch checks if we are dealing
with a kernel address breakpoint, if so and if the exclude_kernel
parameter is set, we tell the user that the breakpoint is invalid,
which makes a good generic ptrace protection.
If we don't have exclude_kernel, ensure the user has the right
privileges as kernel breakpoints are quite sensitive (risk of
trap recursion attacks and global performance impacts).

[ Paul Mundt: keep addr space check for sh signal delivery and fix
  double function declaration]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: K. Prasad <prasad@linux.vnet.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/hw_breakpoint.h  |  5 ++---
 arch/sh/kernel/hw_breakpoint.c       | 34 ++++++------------------------
 arch/x86/include/asm/hw_breakpoint.h |  5 ++---
 arch/x86/kernel/hw_breakpoint.c      | 41 ++++++------------------------------
 kernel/hw_breakpoint.c               | 26 +++++++++++++++++++++--
 5 files changed, 41 insertions(+), 70 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h
index 965dd780d51b..382bad937dcc 100644
--- a/arch/sh/include/asm/hw_breakpoint.h
+++ b/arch/sh/include/asm/hw_breakpoint.h
@@ -47,9 +47,8 @@ struct pmu;
 #define HBP_NUM		2
 
 /* arch/sh/kernel/hw_breakpoint.c */
-extern int arch_check_va_in_userspace(unsigned long va, u16 hbp_len);
-extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
-					 struct task_struct *tsk);
+extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
 extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
 					   unsigned long val, void *data);
 
diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c
index 675eea7785d9..1f2cf6229862 100644
--- a/arch/sh/kernel/hw_breakpoint.c
+++ b/arch/sh/kernel/hw_breakpoint.c
@@ -119,26 +119,17 @@ static int get_hbp_len(u16 hbp_len)
 	return len_in_bytes;
 }
 
-/*
- * Check for virtual address in user space.
- */
-int arch_check_va_in_userspace(unsigned long va, u16 hbp_len)
-{
-	unsigned int len;
-
-	len = get_hbp_len(hbp_len);
-
-	return (va <= TASK_SIZE - len);
-}
-
 /*
  * Check for virtual address in kernel space.
  */
-static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+int arch_check_bp_in_kernelspace(struct perf_event *bp)
 {
 	unsigned int len;
+	unsigned long va;
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 
-	len = get_hbp_len(hbp_len);
+	va = info->address;
+	len = get_hbp_len(info->len);
 
 	return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
 }
@@ -226,8 +217,7 @@ static int arch_build_bp_info(struct perf_event *bp)
 /*
  * Validate the arch-specific HW Breakpoint register settings
  */
-int arch_validate_hwbkpt_settings(struct perf_event *bp,
-				  struct task_struct *tsk)
+int arch_validate_hwbkpt_settings(struct perf_event *bp)
 {
 	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 	unsigned int align;
@@ -270,15 +260,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
 	if (info->address & align)
 		return -EINVAL;
 
-	/* Check that the virtual address is in the proper range */
-	if (tsk) {
-		if (!arch_check_va_in_userspace(info->address, info->len))
-			return -EFAULT;
-	} else {
-		if (!arch_check_va_in_kernelspace(info->address, info->len))
-			return -EFAULT;
-	}
-
 	return 0;
 }
 
@@ -363,8 +344,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
 		perf_bp_event(bp, args->regs);
 
 		/* Deliver the signal to userspace */
-		if (arch_check_va_in_userspace(bp->attr.bp_addr,
-					       bp->attr.bp_len)) {
+		if (!arch_check_bp_in_kernelspace(bp)) {
 			siginfo_t info;
 
 			info.si_signo = args->signr;
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 2a1bd8f4f23a..c77a5a6fab9d 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -44,9 +44,8 @@ struct arch_hw_breakpoint {
 struct perf_event;
 struct pmu;
 
-extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
-extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
-					 struct task_struct *tsk);
+extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
 extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
 					   unsigned long val, void *data);
 
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index d6cc065f519f..a8f1b803d2fd 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -188,26 +188,17 @@ static int get_hbp_len(u8 hbp_len)
 	return len_in_bytes;
 }
 
-/*
- * Check for virtual address in user space.
- */
-int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
-{
-	unsigned int len;
-
-	len = get_hbp_len(hbp_len);
-
-	return (va <= TASK_SIZE - len);
-}
-
 /*
  * Check for virtual address in kernel space.
  */
-static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+int arch_check_bp_in_kernelspace(struct perf_event *bp)
 {
 	unsigned int len;
+	unsigned long va;
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 
-	len = get_hbp_len(hbp_len);
+	va = info->address;
+	len = get_hbp_len(info->len);
 
 	return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
 }
@@ -300,8 +291,7 @@ static int arch_build_bp_info(struct perf_event *bp)
 /*
  * Validate the arch-specific HW Breakpoint register settings
  */
-int arch_validate_hwbkpt_settings(struct perf_event *bp,
-				  struct task_struct *tsk)
+int arch_validate_hwbkpt_settings(struct perf_event *bp)
 {
 	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 	unsigned int align;
@@ -314,16 +304,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
 
 	ret = -EINVAL;
 
-	if (info->type == X86_BREAKPOINT_EXECUTE)
-		/*
-		 * Ptrace-refactoring code
-		 * For now, we'll allow instruction breakpoint only for user-space
-		 * addresses
-		 */
-		if ((!arch_check_va_in_userspace(info->address, info->len)) &&
-			info->len != X86_BREAKPOINT_EXECUTE)
-			return ret;
-
 	switch (info->len) {
 	case X86_BREAKPOINT_LEN_1:
 		align = 0;
@@ -350,15 +330,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
 	if (info->address & align)
 		return -EINVAL;
 
-	/* Check that the virtual address is in the proper range */
-	if (tsk) {
-		if (!arch_check_va_in_userspace(info->address, info->len))
-			return -EFAULT;
-	} else {
-		if (!arch_check_va_in_kernelspace(info->address, info->len))
-			return -EFAULT;
-	}
-
 	return 0;
 }
 
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 9ed9ae3a48b3..89e8a050c43a 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -308,6 +308,28 @@ int dbg_release_bp_slot(struct perf_event *bp)
 	return 0;
 }
 
+static int validate_hw_breakpoint(struct perf_event *bp)
+{
+	int ret;
+
+	ret = arch_validate_hwbkpt_settings(bp);
+	if (ret)
+		return ret;
+
+	if (arch_check_bp_in_kernelspace(bp)) {
+		if (bp->attr.exclude_kernel)
+			return -EINVAL;
+		/*
+		 * Don't let unprivileged users set a breakpoint in the trap
+		 * path to avoid trap recursion attacks.
+		 */
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+	}
+
+	return 0;
+}
+
 int register_perf_hw_breakpoint(struct perf_event *bp)
 {
 	int ret;
@@ -316,7 +338,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
 	if (ret)
 		return ret;
 
-	ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+	ret = validate_hw_breakpoint(bp);
 
 	/* if arch_validate_hwbkpt_settings() fails then release bp slot */
 	if (ret)
@@ -363,7 +385,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
 	if (attr->disabled)
 		goto end;
 
-	err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+	err = validate_hw_breakpoint(bp);
 	if (!err)
 		perf_event_enable(bp);
 
-- 
cgit v1.2.3-55-g7522


From 0102752e4c9e0655b39734550d4c35327954f7f9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sun, 11 Apr 2010 18:55:56 +0200
Subject: hw-breakpoints: Separate constraint space for data and instruction
 breakpoints

There are two outstanding fashions for archs to implement hardware
breakpoints.

The first is to separate breakpoint address pattern definition
space between data and instruction breakpoints. We then have
typically distinct instruction address breakpoint registers
and data address breakpoint registers, delivered with
separate control registers for data and instruction breakpoints
as well. This is the case of PowerPc and ARM for example.

The second consists in having merged breakpoint address space
definition between data and instruction breakpoint. Address
registers can host either instruction or data address and
the access mode for the breakpoint is defined in a control
register. This is the case of x86 and Super H.

This patch adds a new CONFIG_HAVE_MIXED_BREAKPOINTS_REGS config
that archs can select if they belong to the second case. Those
will have their slot allocation merged for instructions and
data breakpoints.

The others will have a separate slot tracking between data and
instruction breakpoints.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: K. Prasad <prasad@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/Kconfig                  | 11 ++++++
 arch/sh/Kconfig               |  1 +
 arch/x86/Kconfig              |  1 +
 include/linux/hw_breakpoint.h |  9 +++--
 kernel/hw_breakpoint.c        | 86 +++++++++++++++++++++++++++++--------------
 5 files changed, 78 insertions(+), 30 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index f06010fb4838..acda512da2e2 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -137,6 +137,17 @@ config HAVE_HW_BREAKPOINT
 	bool
 	depends on PERF_EVENTS
 
+config HAVE_MIXED_BREAKPOINTS_REGS
+	bool
+	depends on HAVE_HW_BREAKPOINT
+	help
+	  Depending on the arch implementation of hardware breakpoints,
+	  some of them have separate registers for data and instruction
+	  breakpoints addresses, others have mixed registers to store
+	  them but define the access type in a control register.
+	  Select this option if your arch implements breakpoints under the
+	  latter fashion.
+
 config HAVE_USER_RETURN_NOTIFIER
 	bool
 
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 8d90564c2bcf..e6d8ab5cfa9d 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH32
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_ARCH_KGDB
 	select HAVE_HW_BREAKPOINT
+	select HAVE_MIXED_BREAKPOINTS_REGS
 	select PERF_EVENTS if HAVE_HW_BREAKPOINT
 	select ARCH_HIBERNATION_POSSIBLE if MMU
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 97a95dfd1181..01177dcbe261 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -53,6 +53,7 @@ config X86
 	select HAVE_KERNEL_LZMA
 	select HAVE_KERNEL_LZO
 	select HAVE_HW_BREAKPOINT
+	select HAVE_MIXED_BREAKPOINTS_REGS
 	select PERF_EVENTS
 	select ANON_INODES
 	select HAVE_ARCH_KMEMCHECK
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index a0aa5a9cfb0e..7e8899093098 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -9,9 +9,12 @@ enum {
 };
 
 enum {
-	HW_BREAKPOINT_R = 1,
-	HW_BREAKPOINT_W = 2,
-	HW_BREAKPOINT_X = 4,
+	HW_BREAKPOINT_EMPTY	= 0,
+	HW_BREAKPOINT_R		= 1,
+	HW_BREAKPOINT_W		= 2,
+	HW_BREAKPOINT_RW	= HW_BREAKPOINT_R | HW_BREAKPOINT_W,
+	HW_BREAKPOINT_X		= 4,
+	HW_BREAKPOINT_INVALID   = HW_BREAKPOINT_RW | HW_BREAKPOINT_X,
 };
 
 #ifdef __KERNEL__
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 89e8a050c43a..8ead1345e33b 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -45,18 +45,28 @@
 
 #include <linux/hw_breakpoint.h>
 
+enum bp_type_idx {
+	TYPE_INST 	= 0,
+#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS
+	TYPE_DATA	= 0,
+#else
+	TYPE_DATA	= 1,
+#endif
+	TYPE_MAX
+};
+
 /*
  * Constraints data
  */
 
 /* Number of pinned cpu breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
+static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
 
 /* Number of pinned task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
+static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[TYPE_MAX][HBP_NUM]);
 
 /* Number of non-pinned cpu/task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
+static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
 
 /* Gather the number of total pinned and un-pinned bp in a cpuset */
 struct bp_busy_slots {
@@ -67,14 +77,22 @@ struct bp_busy_slots {
 /* Serialize accesses to the above constraints */
 static DEFINE_MUTEX(nr_bp_mutex);
 
+static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
+{
+	if (bp->attr.bp_type & HW_BREAKPOINT_RW)
+		return TYPE_DATA;
+
+	return TYPE_INST;
+}
+
 /*
  * Report the maximum number of pinned breakpoints a task
  * have in this cpu
  */
-static unsigned int max_task_bp_pinned(int cpu)
+static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
 {
 	int i;
-	unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
+	unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
 
 	for (i = HBP_NUM -1; i >= 0; i--) {
 		if (tsk_pinned[i] > 0)
@@ -84,7 +102,7 @@ static unsigned int max_task_bp_pinned(int cpu)
 	return 0;
 }
 
-static int task_bp_pinned(struct task_struct *tsk)
+static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
 {
 	struct perf_event_context *ctx = tsk->perf_event_ctxp;
 	struct list_head *list;
@@ -105,7 +123,8 @@ static int task_bp_pinned(struct task_struct *tsk)
 	 */
 	list_for_each_entry(bp, list, event_entry) {
 		if (bp->attr.type == PERF_TYPE_BREAKPOINT)
-			count++;
+			if (find_slot_idx(bp) == type)
+				count++;
 	}
 
 	raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -118,18 +137,19 @@ static int task_bp_pinned(struct task_struct *tsk)
  * a given cpu (cpu > -1) or in all of them (cpu = -1).
  */
 static void
-fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
+fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
+		    enum bp_type_idx type)
 {
 	int cpu = bp->cpu;
 	struct task_struct *tsk = bp->ctx->task;
 
 	if (cpu >= 0) {
-		slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
+		slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
 		if (!tsk)
-			slots->pinned += max_task_bp_pinned(cpu);
+			slots->pinned += max_task_bp_pinned(cpu, type);
 		else
-			slots->pinned += task_bp_pinned(tsk);
-		slots->flexible = per_cpu(nr_bp_flexible, cpu);
+			slots->pinned += task_bp_pinned(tsk, type);
+		slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
 
 		return;
 	}
@@ -137,16 +157,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
 	for_each_online_cpu(cpu) {
 		unsigned int nr;
 
-		nr = per_cpu(nr_cpu_bp_pinned, cpu);
+		nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
 		if (!tsk)
-			nr += max_task_bp_pinned(cpu);
+			nr += max_task_bp_pinned(cpu, type);
 		else
-			nr += task_bp_pinned(tsk);
+			nr += task_bp_pinned(tsk, type);
 
 		if (nr > slots->pinned)
 			slots->pinned = nr;
 
-		nr = per_cpu(nr_bp_flexible, cpu);
+		nr = per_cpu(nr_bp_flexible[type], cpu);
 
 		if (nr > slots->flexible)
 			slots->flexible = nr;
@@ -156,14 +176,15 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
 /*
  * Add a pinned breakpoint for the given task in our constraint table
  */
-static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
+static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
+				enum bp_type_idx type)
 {
 	unsigned int *tsk_pinned;
 	int count = 0;
 
-	count = task_bp_pinned(tsk);
+	count = task_bp_pinned(tsk, type);
 
-	tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
+	tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
 	if (enable) {
 		tsk_pinned[count]++;
 		if (count > 0)
@@ -178,7 +199,8 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
 /*
  * Add/remove the given breakpoint in our constraint table
  */
-static void toggle_bp_slot(struct perf_event *bp, bool enable)
+static void
+toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type)
 {
 	int cpu = bp->cpu;
 	struct task_struct *tsk = bp->ctx->task;
@@ -186,20 +208,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
 	/* Pinned counter task profiling */
 	if (tsk) {
 		if (cpu >= 0) {
-			toggle_bp_task_slot(tsk, cpu, enable);
+			toggle_bp_task_slot(tsk, cpu, enable, type);
 			return;
 		}
 
 		for_each_online_cpu(cpu)
-			toggle_bp_task_slot(tsk, cpu, enable);
+			toggle_bp_task_slot(tsk, cpu, enable, type);
 		return;
 	}
 
 	/* Pinned counter cpu profiling */
 	if (enable)
-		per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
+		per_cpu(nr_cpu_bp_pinned[type], bp->cpu)++;
 	else
-		per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
+		per_cpu(nr_cpu_bp_pinned[type], bp->cpu)--;
 }
 
 /*
@@ -246,14 +268,21 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
 static int __reserve_bp_slot(struct perf_event *bp)
 {
 	struct bp_busy_slots slots = {0};
+	enum bp_type_idx type;
 
-	fetch_bp_busy_slots(&slots, bp);
+	/* Basic checks */
+	if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
+	    bp->attr.bp_type == HW_BREAKPOINT_INVALID)
+		return -EINVAL;
+
+	type = find_slot_idx(bp);
+	fetch_bp_busy_slots(&slots, bp, type);
 
 	/* Flexible counters need to keep at least one slot */
 	if (slots.pinned + (!!slots.flexible) == HBP_NUM)
 		return -ENOSPC;
 
-	toggle_bp_slot(bp, true);
+	toggle_bp_slot(bp, true, type);
 
 	return 0;
 }
@@ -273,7 +302,10 @@ int reserve_bp_slot(struct perf_event *bp)
 
 static void __release_bp_slot(struct perf_event *bp)
 {
-	toggle_bp_slot(bp, false);
+	enum bp_type_idx type;
+
+	type = find_slot_idx(bp);
+	toggle_bp_slot(bp, false, type);
 }
 
 void release_bp_slot(struct perf_event *bp)
-- 
cgit v1.2.3-55-g7522


From feef47d0cb530e8419dfa0b48141b538b89b1b1a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 23 Apr 2010 05:59:55 +0200
Subject: hw-breakpoints: Get the number of available registers on boot
 dynamically

The breakpoint generic layer assumes that archs always know in advance
the static number of address registers available to host breakpoints
through the HBP_NUM macro.

However this is not true for every archs. For example Arm needs to get
this information dynamically to handle the compatiblity between
different versions.

To solve this, this patch proposes to drop the static HBP_NUM macro
and let the arch provide the number of available slots through a
new hw_breakpoint_slots() function. For archs that have
CONFIG_HAVE_MIXED_BREAKPOINTS_REGS selected, it will be called once
as the number of registers fits for instruction and data breakpoints
together.
For the others it will be called first to get the number of
instruction breakpoint registers and another time to get the
data breakpoint registers, the targeted type is given as a
parameter of hw_breakpoint_slots().

Reported-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: K. Prasad <prasad@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/sh/include/asm/hw_breakpoint.h  |  5 ++++
 arch/x86/include/asm/hw_breakpoint.h |  5 ++++
 include/linux/hw_breakpoint.h        | 10 +++++++
 kernel/hw_breakpoint.c               | 53 ++++++++++++++++++++++++++++--------
 kernel/trace/trace_ksym.c            | 26 +++++-------------
 5 files changed, 68 insertions(+), 31 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h
index 382bad937dcc..e14cad96798f 100644
--- a/arch/sh/include/asm/hw_breakpoint.h
+++ b/arch/sh/include/asm/hw_breakpoint.h
@@ -46,6 +46,11 @@ struct pmu;
 /* Maximum number of UBC channels */
 #define HBP_NUM		2
 
+static inline int hw_breakpoint_slots(int type)
+{
+	return HBP_NUM;
+}
+
 /* arch/sh/kernel/hw_breakpoint.c */
 extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
 extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index c77a5a6fab9d..942255310e6a 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -41,6 +41,11 @@ struct arch_hw_breakpoint {
 /* Total number of available HW breakpoint registers */
 #define HBP_NUM 4
 
+static inline int hw_breakpoint_slots(int type)
+{
+	return HBP_NUM;
+}
+
 struct perf_event;
 struct pmu;
 
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 7e8899093098..a2d6ea49ec56 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -17,6 +17,16 @@ enum {
 	HW_BREAKPOINT_INVALID   = HW_BREAKPOINT_RW | HW_BREAKPOINT_X,
 };
 
+enum bp_type_idx {
+	TYPE_INST 	= 0,
+#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS
+	TYPE_DATA	= 0,
+#else
+	TYPE_DATA	= 1,
+#endif
+	TYPE_MAX
+};
+
 #ifdef __KERNEL__
 
 #include <linux/perf_event.h>
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 974498b858fc..684b710cbb91 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,20 +40,12 @@
 #include <linux/percpu.h>
 #include <linux/sched.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 
 #include <linux/hw_breakpoint.h>
 
-enum bp_type_idx {
-	TYPE_INST 	= 0,
-#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS
-	TYPE_DATA	= 0,
-#else
-	TYPE_DATA	= 1,
-#endif
-	TYPE_MAX
-};
 
 /*
  * Constraints data
@@ -63,11 +55,15 @@ enum bp_type_idx {
 static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
 
 /* Number of pinned task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[TYPE_MAX][HBP_NUM]);
+static DEFINE_PER_CPU(unsigned int, *nr_task_bp_pinned[TYPE_MAX]);
 
 /* Number of non-pinned cpu/task breakpoints in a cpu */
 static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
 
+static int nr_slots[TYPE_MAX];
+
+static int constraints_initialized;
+
 /* Gather the number of total pinned and un-pinned bp in a cpuset */
 struct bp_busy_slots {
 	unsigned int pinned;
@@ -99,7 +95,7 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
 	int i;
 	unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
 
-	for (i = HBP_NUM -1; i >= 0; i--) {
+	for (i = nr_slots[type] - 1; i >= 0; i--) {
 		if (tsk_pinned[i] > 0)
 			return i + 1;
 	}
@@ -292,6 +288,10 @@ static int __reserve_bp_slot(struct perf_event *bp)
 	enum bp_type_idx type;
 	int weight;
 
+	/* We couldn't initialize breakpoint constraints on boot */
+	if (!constraints_initialized)
+		return -ENOMEM;
+
 	/* Basic checks */
 	if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
 	    bp->attr.bp_type == HW_BREAKPOINT_INVALID)
@@ -304,7 +304,7 @@ static int __reserve_bp_slot(struct perf_event *bp)
 	fetch_this_slot(&slots, weight);
 
 	/* Flexible counters need to keep at least one slot */
-	if (slots.pinned + (!!slots.flexible) > HBP_NUM)
+	if (slots.pinned + (!!slots.flexible) > nr_slots[type])
 		return -ENOSPC;
 
 	toggle_bp_slot(bp, true, type, weight);
@@ -551,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
 
 static int __init init_hw_breakpoint(void)
 {
+	unsigned int **task_bp_pinned;
+	int cpu, err_cpu;
+	int i;
+
+	for (i = 0; i < TYPE_MAX; i++)
+		nr_slots[i] = hw_breakpoint_slots(i);
+
+	for_each_possible_cpu(cpu) {
+		for (i = 0; i < TYPE_MAX; i++) {
+			task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
+			*task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
+						  GFP_KERNEL);
+			if (!*task_bp_pinned)
+				goto err_alloc;
+		}
+	}
+
+	constraints_initialized = 1;
+
 	return register_die_notifier(&hw_breakpoint_exceptions_nb);
+
+ err_alloc:
+	for_each_possible_cpu(err_cpu) {
+		if (err_cpu == cpu)
+			break;
+		for (i = 0; i < TYPE_MAX; i++)
+			kfree(per_cpu(nr_task_bp_pinned[i], cpu));
+	}
+
+	return -ENOMEM;
 }
 core_initcall(init_hw_breakpoint);
 
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index d59cd6879477..8eaf00749b65 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -34,12 +34,6 @@
 
 #include <asm/atomic.h>
 
-/*
- * For now, let us restrict the no. of symbols traced simultaneously to number
- * of available hardware breakpoint registers.
- */
-#define KSYM_TRACER_MAX HBP_NUM
-
 #define KSYM_TRACER_OP_LEN 3 /* rw- */
 
 struct trace_ksym {
@@ -53,7 +47,6 @@ struct trace_ksym {
 
 static struct trace_array *ksym_trace_array;
 
-static unsigned int ksym_filter_entry_count;
 static unsigned int ksym_tracing_enabled;
 
 static HLIST_HEAD(ksym_filter_head);
@@ -181,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
 	struct trace_ksym *entry;
 	int ret = -ENOMEM;
 
-	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
-		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
-		" new requests for tracing can be accepted now.\n",
-			KSYM_TRACER_MAX);
-		return -ENOSPC;
-	}
-
 	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
 	if (!entry)
 		return -ENOMEM;
@@ -203,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
 
 	if (IS_ERR(entry->ksym_hbp)) {
 		ret = PTR_ERR(entry->ksym_hbp);
-		printk(KERN_INFO "ksym_tracer request failed. Try again"
-					" later!!\n");
+		if (ret == -ENOSPC) {
+			printk(KERN_ERR "ksym_tracer: Maximum limit reached."
+			" No new requests for tracing can be accepted now.\n");
+		} else {
+			printk(KERN_INFO "ksym_tracer request failed. Try again"
+					 " later!!\n");
+		}
 		goto err;
 	}
 
 	hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
-	ksym_filter_entry_count++;
 
 	return 0;
 
@@ -265,7 +255,6 @@ static void __ksym_trace_reset(void)
 	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
 								ksym_hlist) {
 		unregister_wide_hw_breakpoint(entry->ksym_hbp);
-		ksym_filter_entry_count--;
 		hlist_del_rcu(&(entry->ksym_hlist));
 		synchronize_rcu();
 		kfree(entry);
@@ -338,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 				goto out_unlock;
 		}
 		/* Error or "symbol:---" case: drop it */
-		ksym_filter_entry_count--;
 		hlist_del_rcu(&(entry->ksym_hlist));
 		synchronize_rcu();
 		kfree(entry);
-- 
cgit v1.2.3-55-g7522


From 40d2e76315da38993129090dc5d56377e573c312 Mon Sep 17 00:00:00 2001
From: Brian Gerst
Date: Sun, 21 Mar 2010 09:00:43 -0400
Subject: x86-32: Rework cache flush denied handler

The cache flush denied error is an erratum on some AMD 486 clones.  If an invd
instruction is executed in userspace, the processor calls exception 19 (13 hex)
instead of #GP (13 decimal).  On cpus where XMM is not supported, redirect
exception 19 to do_general_protection().  Also, remove die_if_kernel(), since
this was the last user.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-2-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig.cpu       |  4 ++++
 arch/x86/kernel/entry_32.S | 19 +++++++++++++++++++
 arch/x86/kernel/traps.c    | 31 +++----------------------------
 3 files changed, 26 insertions(+), 28 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a19829374e6a..6f6792c56015 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -338,6 +338,10 @@ config X86_F00F_BUG
 	def_bool y
 	depends on M586MMX || M586TSC || M586 || M486 || M386
 
+config X86_INVD_BUG
+	def_bool y
+	depends on M486 || M386
+
 config X86_WP_WORKS_OK
 	def_bool y
 	depends on !M386
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 44a8e0dc6737..cd49141cf153 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -53,6 +53,7 @@
 #include <asm/processor-flags.h>
 #include <asm/ftrace.h>
 #include <asm/irq_vectors.h>
+#include <asm/cpufeature.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -905,7 +906,25 @@ ENTRY(simd_coprocessor_error)
 	RING0_INT_FRAME
 	pushl $0
 	CFI_ADJUST_CFA_OFFSET 4
+#ifdef CONFIG_X86_INVD_BUG
+	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
+661:	pushl $do_general_protection
+662:
+.section .altinstructions,"a"
+	.balign 4
+	.long 661b
+	.long 663f
+	.byte X86_FEATURE_XMM
+	.byte 662b-661b
+	.byte 664f-663f
+.previous
+.section .altinstr_replacement,"ax"
+663:	pushl $do_simd_coprocessor_error
+664:
+.previous
+#else
 	pushl $do_simd_coprocessor_error
+#endif
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e4454188..a16c9dfe6b70 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,15 +108,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	dec_preempt_count();
 }
 
-#ifdef CONFIG_X86_32
-static inline void
-die_if_kernel(const char *str, struct pt_regs *regs, long err)
-{
-	if (!user_mode_vm(regs))
-		die(str, regs, err);
-}
-#endif
-
 static void __kprobes
 do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 	long error_code, siginfo_t *info)
@@ -729,30 +720,14 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 	conditional_sti(regs);
 
 #ifdef CONFIG_X86_32
-	if (cpu_has_xmm) {
-		/* Handle SIMD FPU exceptions on PIII+ processors. */
-		ignore_fpu_irq = 1;
-		simd_math_error((void __user *)regs->ip);
-		return;
-	}
-	/*
-	 * Handle strange cache flush from user space exception
-	 * in all other cases.  This is undocumented behaviour.
-	 */
-	if (regs->flags & X86_VM_MASK) {
-		handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
-		return;
-	}
-	current->thread.trap_no = 19;
-	current->thread.error_code = error_code;
-	die_if_kernel("cache flush denied", regs, error_code);
-	force_sig(SIGSEGV, current);
+	ignore_fpu_irq = 1;
 #else
 	if (!user_mode(regs) &&
 			kernel_math_error(regs, "kernel simd math error", 19))
 		return;
-	simd_math_error((void __user *)regs->ip);
 #endif
+
+	simd_math_error((void __user *)regs->ip);
 }
 
 dotraplinkage void
-- 
cgit v1.2.3-55-g7522


From 9b6dba9e0798325dab427b9d60c61630ccc39b28 Mon Sep 17 00:00:00 2001
From: Brian Gerst
Date: Sun, 21 Mar 2010 09:00:44 -0400
Subject: x86: Merge simd_math_error() into math_error()

The only difference between FPU and SIMD exceptions is where the
status bits are read from (cwd/swd vs. mxcsr).  This also fixes
the discrepency introduced by commit adf77bac, which fixed FPU
but not SIMD.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-3-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/traps.h |   2 +-
 arch/x86/kernel/irqinit.c    |   2 +-
 arch/x86/kernel/traps.c      | 100 ++++++++++++++-----------------------------
 3 files changed, 34 insertions(+), 70 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 4da91ad69e0d..f66cda56781d 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -79,7 +79,7 @@ static inline int get_si_code(unsigned long condition)
 
 extern int panic_on_unrecovered_nmi;
 
-void math_error(void __user *);
+void math_error(struct pt_regs *, int, int);
 void math_emulate(struct math_emu_info *);
 #ifndef CONFIG_X86_32
 asmlinkage void smp_thermal_interrupt(void);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 0ed2d300cd46..990ae7cfc578 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -60,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
 	outb(0, 0xF0);
 	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
 		return IRQ_NONE;
-	math_error((void __user *)get_irq_regs()->ip);
+	math_error(get_irq_regs(), 0, 16);
 	return IRQ_HANDLED;
 }
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a16c9dfe6b70..a472992cecdc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -595,36 +595,48 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
  * the correct behaviour even in the presence of the asynchronous
  * IRQ13 behaviour
  */
-void math_error(void __user *ip)
+void math_error(struct pt_regs *regs, int error_code, int trapnr)
 {
 	struct task_struct *task;
 	siginfo_t info;
-	unsigned short cwd, swd, err;
+	unsigned short err;
 
 	/*
 	 * Save the info for the exception handler and clear the error.
 	 */
 	task = current;
 	save_init_fpu(task);
-	task->thread.trap_no = 16;
-	task->thread.error_code = 0;
+	task->thread.trap_no = trapnr;
+	task->thread.error_code = error_code;
 	info.si_signo = SIGFPE;
 	info.si_errno = 0;
-	info.si_addr = ip;
-	/*
-	 * (~cwd & swd) will mask out exceptions that are not set to unmasked
-	 * status.  0x3f is the exception bits in these regs, 0x200 is the
-	 * C1 reg you need in case of a stack fault, 0x040 is the stack
-	 * fault bit.  We should only be taking one exception at a time,
-	 * so if this combination doesn't produce any single exception,
-	 * then we have a bad program that isn't synchronizing its FPU usage
-	 * and it will suffer the consequences since we won't be able to
-	 * fully reproduce the context of the exception
-	 */
-	cwd = get_fpu_cwd(task);
-	swd = get_fpu_swd(task);
+	info.si_addr = (void __user *)regs->ip;
+	if (trapnr == 16) {
+		unsigned short cwd, swd;
+		/*
+		 * (~cwd & swd) will mask out exceptions that are not set to unmasked
+		 * status.  0x3f is the exception bits in these regs, 0x200 is the
+		 * C1 reg you need in case of a stack fault, 0x040 is the stack
+		 * fault bit.  We should only be taking one exception at a time,
+		 * so if this combination doesn't produce any single exception,
+		 * then we have a bad program that isn't synchronizing its FPU usage
+		 * and it will suffer the consequences since we won't be able to
+		 * fully reproduce the context of the exception
+		 */
+		cwd = get_fpu_cwd(task);
+		swd = get_fpu_swd(task);
 
-	err = swd & ~cwd;
+		err = swd & ~cwd;
+	} else {
+		/*
+		 * The SIMD FPU exceptions are handled a little differently, as there
+		 * is only a single status/control register.  Thus, to determine which
+		 * unmasked exception was caught we must mask the exception mask bits
+		 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+		 */
+		unsigned short mxcsr = get_fpu_mxcsr(task);
+		err = ~(mxcsr >> 7) & mxcsr;
+	}
 
 	if (err & 0x001) {	/* Invalid op */
 		/*
@@ -663,55 +675,7 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 		return;
 #endif
 
-	math_error((void __user *)regs->ip);
-}
-
-static void simd_math_error(void __user *ip)
-{
-	struct task_struct *task;
-	siginfo_t info;
-	unsigned short mxcsr;
-
-	/*
-	 * Save the info for the exception handler and clear the error.
-	 */
-	task = current;
-	save_init_fpu(task);
-	task->thread.trap_no = 19;
-	task->thread.error_code = 0;
-	info.si_signo = SIGFPE;
-	info.si_errno = 0;
-	info.si_code = __SI_FAULT;
-	info.si_addr = ip;
-	/*
-	 * The SIMD FPU exceptions are handled a little differently, as there
-	 * is only a single status/control register.  Thus, to determine which
-	 * unmasked exception was caught we must mask the exception mask bits
-	 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
-	 */
-	mxcsr = get_fpu_mxcsr(task);
-	switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
-	case 0x000:
-	default:
-		break;
-	case 0x001: /* Invalid Op */
-		info.si_code = FPE_FLTINV;
-		break;
-	case 0x002: /* Denormalize */
-	case 0x010: /* Underflow */
-		info.si_code = FPE_FLTUND;
-		break;
-	case 0x004: /* Zero Divide */
-		info.si_code = FPE_FLTDIV;
-		break;
-	case 0x008: /* Overflow */
-		info.si_code = FPE_FLTOVF;
-		break;
-	case 0x020: /* Precision */
-		info.si_code = FPE_FLTRES;
-		break;
-	}
-	force_sig_info(SIGFPE, &info, task);
+	math_error(regs, error_code, 16);
 }
 
 dotraplinkage void
@@ -727,7 +691,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 		return;
 #endif
 
-	simd_math_error((void __user *)regs->ip);
+	math_error(regs, error_code, 19);
 }
 
 dotraplinkage void
-- 
cgit v1.2.3-55-g7522


From e2e75c915de045f0785387dc32f55e92fab0614c Mon Sep 17 00:00:00 2001
From: Brian Gerst
Date: Sun, 21 Mar 2010 09:00:45 -0400
Subject: x86: Merge kernel_math_error() into math_error()

Clean up the kernel exception handling and make it more similar to
the other traps.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-4-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/traps.c | 44 ++++++++++++++++----------------------------
 1 file changed, 16 insertions(+), 28 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a472992cecdc..53ba86fc5dd8 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -576,20 +576,6 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	return;
 }
 
-#ifdef CONFIG_X86_64
-static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
-{
-	if (fixup_exception(regs))
-		return 1;
-
-	notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
-	/* Illegal floating point operation in the kernel */
-	current->thread.trap_no = trapnr;
-	die(str, regs, 0);
-	return 0;
-}
-#endif
-
 /*
  * Note that we play around with the 'TS' bit in an attempt to get
  * the correct behaviour even in the presence of the asynchronous
@@ -597,14 +583,28 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
  */
 void math_error(struct pt_regs *regs, int error_code, int trapnr)
 {
-	struct task_struct *task;
+	struct task_struct *task = current;
 	siginfo_t info;
 	unsigned short err;
+	char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
+
+	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
+		return;
+	conditional_sti(regs);
+
+	if (!user_mode_vm(regs))
+	{
+		if (!fixup_exception(regs)) {
+			task->thread.error_code = error_code;
+			task->thread.trap_no = trapnr;
+			die(str, regs, error_code);
+		}
+		return;
+	}
 
 	/*
 	 * Save the info for the exception handler and clear the error.
 	 */
-	task = current;
 	save_init_fpu(task);
 	task->thread.trap_no = trapnr;
 	task->thread.error_code = error_code;
@@ -665,14 +665,8 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
 
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	conditional_sti(regs);
-
 #ifdef CONFIG_X86_32
 	ignore_fpu_irq = 1;
-#else
-	if (!user_mode(regs) &&
-	    kernel_math_error(regs, "kernel x87 math error", 16))
-		return;
 #endif
 
 	math_error(regs, error_code, 16);
@@ -681,14 +675,8 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	conditional_sti(regs);
-
 #ifdef CONFIG_X86_32
 	ignore_fpu_irq = 1;
-#else
-	if (!user_mode(regs) &&
-			kernel_math_error(regs, "kernel simd math error", 19))
-		return;
 #endif
 
 	math_error(regs, error_code, 19);
-- 
cgit v1.2.3-55-g7522


From 250825008f1f94887bc039e9227a8adfb5ba366e Mon Sep 17 00:00:00 2001
From: Brian Gerst
Date: Sun, 21 Mar 2010 09:00:46 -0400
Subject: x86-32: Don't set ignore_fpu_irq in simd exception

Any processor that supports simd will have an internal fpu, and the
irq13 handler will not be enabled.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-5-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/traps.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 53ba86fc5dd8..00516e1de55d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -675,10 +675,6 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-#ifdef CONFIG_X86_32
-	ignore_fpu_irq = 1;
-#endif
-
 	math_error(regs, error_code, 19);
 }
 
-- 
cgit v1.2.3-55-g7522


From d88d95eb1c2a72b6126a550debe0883ff723a948 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sat, 24 Apr 2010 09:56:53 +0200
Subject: x86, k8: Fix build error when K8_NB is disabled

K8_NB depends on PCI and when the last is disabled (allnoconfig) we fail
at the final linking stage due to missing exported num_k8_northbridges.
Add a header stub for that.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20100503183036.GJ26107@aftab>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/k8.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index f70e60071fe8..af00bd1d2089 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -16,11 +16,16 @@ extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
 extern int k8_scan_nodes(void);
 
 #ifdef CONFIG_K8_NB
+extern int num_k8_northbridges;
+
 static inline struct pci_dev *node_to_k8_nb_misc(int node)
 {
 	return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
 }
+
 #else
+#define num_k8_northbridges 0
+
 static inline struct pci_dev *node_to_k8_nb_misc(int node)
 {
 	return NULL;
-- 
cgit v1.2.3-55-g7522


From 097c1bd5673edaf2a162724636858b71f658fdd2 Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Mon, 3 May 2010 15:49:31 -0700
Subject: x86, cpu: Make APERF/MPERF a normal table-driven flag

APERF/MPERF can be handled via the table like all the other scattered
CPU flags.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-4-git-send-email-bp@amd64.org>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index fd1fc1902a47..10fa5684a662 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -30,13 +30,14 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	const struct cpuid_bit *cb;
 
 	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
-		{ X86_FEATURE_IDA,   CR_EAX, 1, 0x00000006 },
-		{ X86_FEATURE_ARAT,  CR_EAX, 2, 0x00000006 },
-		{ X86_FEATURE_CPB,   CR_EDX, 9, 0x80000007 },
-		{ X86_FEATURE_NPT,   CR_EDX, 0, 0x8000000a },
-		{ X86_FEATURE_LBRV,  CR_EDX, 1, 0x8000000a },
-		{ X86_FEATURE_SVML,  CR_EDX, 2, 0x8000000a },
-		{ X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
+		{ X86_FEATURE_IDA,   		CR_EAX, 1, 0x00000006 },
+		{ X86_FEATURE_ARAT,  		CR_EAX, 2, 0x00000006 },
+		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006 },
+		{ X86_FEATURE_CPB,   		CR_EDX, 9, 0x80000007 },
+		{ X86_FEATURE_NPT,   		CR_EDX, 0, 0x8000000a },
+		{ X86_FEATURE_LBRV,  		CR_EDX, 1, 0x8000000a },
+		{ X86_FEATURE_SVML,  		CR_EDX, 2, 0x8000000a },
+		{ X86_FEATURE_NRIPS, 		CR_EDX, 3, 0x8000000a },
 		{ 0, 0, 0, 0 }
 	};
 
@@ -54,14 +55,6 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		if (regs[cb->reg] & (1 << cb->bit))
 			set_cpu_cap(c, cb->feature);
 	}
-
-	/*
-	 * common AMD/Intel features
-	 */
-	if (c->cpuid_level >= 6) {
-		if (cpuid_ecx(6) & 0x1)
-			set_cpu_cap(c, X86_FEATURE_APERFMPERF);
-	}
 }
 
 /* leaf 0xb SMT level */
-- 
cgit v1.2.3-55-g7522


From 8f5a2dd83a1f8e89fdc17eb0f2f07c2e713e635a Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Tue, 23 Mar 2010 19:09:51 +0100
Subject: oprofile/x86: rework error handler in nmi_setup()

This patch improves the error handler in nmi_setup(). Most parts of
the code are moved to allocate_msrs(). In case of an error
allocate_msrs() also frees already allocated memory. nmi_setup()
becomes easier and better extendable.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 2c505ee71014..c0c21f200faf 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -295,6 +295,7 @@ static void free_msrs(void)
 		kfree(per_cpu(cpu_msrs, i).controls);
 		per_cpu(cpu_msrs, i).controls = NULL;
 	}
+	nmi_shutdown_mux();
 }
 
 static int allocate_msrs(void)
@@ -307,14 +308,21 @@ static int allocate_msrs(void)
 		per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
 							GFP_KERNEL);
 		if (!per_cpu(cpu_msrs, i).counters)
-			return 0;
+			goto fail;
 		per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
 							GFP_KERNEL);
 		if (!per_cpu(cpu_msrs, i).controls)
-			return 0;
+			goto fail;
 	}
 
+	if (!nmi_setup_mux())
+		goto fail;
+
 	return 1;
+
+fail:
+	free_msrs();
+	return 0;
 }
 
 static void nmi_cpu_setup(void *dummy)
@@ -342,17 +350,7 @@ static int nmi_setup(void)
 	int cpu;
 
 	if (!allocate_msrs())
-		err = -ENOMEM;
-	else if (!nmi_setup_mux())
-		err = -ENOMEM;
-	else
-		err = register_die_notifier(&profile_exceptions_nb);
-
-	if (err) {
-		free_msrs();
-		nmi_shutdown_mux();
-		return err;
-	}
+		return -ENOMEM;
 
 	/* We need to serialize save and setup for HT because the subset
 	 * of msrs are distinct for save and setup operations
@@ -374,9 +372,17 @@ static int nmi_setup(void)
 
 		mux_clone(cpu);
 	}
+
+	err = register_die_notifier(&profile_exceptions_nb);
+	if (err)
+		goto fail;
+
 	on_each_cpu(nmi_cpu_setup, NULL, 1);
 	nmi_enabled = 1;
 	return 0;
+fail:
+	free_msrs();
+	return err;
 }
 
 static void nmi_cpu_restore_registers(struct op_msrs *msrs)
@@ -421,7 +427,6 @@ static void nmi_shutdown(void)
 	nmi_enabled = 0;
 	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
 	unregister_die_notifier(&profile_exceptions_nb);
-	nmi_shutdown_mux();
 	msrs = &get_cpu_var(cpu_msrs);
 	model->shutdown(msrs);
 	free_msrs();
-- 
cgit v1.2.3-55-g7522


From d0e4120fda6f87eead438eed4d49032e12060e58 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Tue, 23 Mar 2010 19:33:21 +0100
Subject: oprofile/x86: reserve counter msrs pairwise

For AMD's and Intel's P6 generic performance counters have pairwise
counter and control msrs. This patch changes the counter reservation
in a way that both msrs must be registered. It joins some counter
loops and also removes the unnecessary NUM_CONTROLS macro in the AMD
implementation.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/op_model_amd.c  | 43 +++++++++++++++++----------------------
 arch/x86/oprofile/op_model_ppro.c | 36 ++++++++++++++++----------------
 2 files changed, 36 insertions(+), 43 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 090cbbec7dbd..2e2bc902b867 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -30,13 +30,10 @@
 #include "op_counter.h"
 
 #define NUM_COUNTERS 4
-#define NUM_CONTROLS 4
 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
 #define NUM_VIRT_COUNTERS 32
-#define NUM_VIRT_CONTROLS 32
 #else
 #define NUM_VIRT_COUNTERS NUM_COUNTERS
-#define NUM_VIRT_CONTROLS NUM_CONTROLS
 #endif
 
 #define OP_EVENT_MASK			0x0FFF
@@ -134,13 +131,15 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
 	int i;
 
 	for (i = 0; i < NUM_COUNTERS; i++) {
-		if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
-			msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
-	}
-
-	for (i = 0; i < NUM_CONTROLS; i++) {
-		if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
-			msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+		if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
+			continue;
+		if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
+			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+			continue;
+		}
+		/* both registers must be reserved */
+		msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+		msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
 	}
 }
 
@@ -160,7 +159,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
 	}
 
 	/* clear all counters */
-	for (i = 0; i < NUM_CONTROLS; ++i) {
+	for (i = 0; i < NUM_COUNTERS; ++i) {
 		if (unlikely(!msrs->controls[i].addr)) {
 			if (counter_config[i].enabled && !smp_processor_id())
 				/*
@@ -175,12 +174,10 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
 			op_x86_warn_in_use(i);
 		val &= model->reserved;
 		wrmsrl(msrs->controls[i].addr, val);
-	}
-
-	/* avoid a false detection of ctr overflows in NMI handler */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (unlikely(!msrs->counters[i].addr))
-			continue;
+		/*
+		 * avoid a false detection of ctr overflows in NMI
+		 * handler
+		 */
 		wrmsrl(msrs->counters[i].addr, -1LL);
 	}
 
@@ -430,12 +427,10 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
 	int i;
 
 	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (msrs->counters[i].addr)
-			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-	}
-	for (i = 0; i < NUM_CONTROLS; ++i) {
-		if (msrs->controls[i].addr)
-			release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
 	}
 }
 
@@ -583,7 +578,7 @@ static void op_amd_exit(void)
 
 struct op_x86_model_spec op_amd_spec = {
 	.num_counters		= NUM_COUNTERS,
-	.num_controls		= NUM_CONTROLS,
+	.num_controls		= NUM_COUNTERS,
 	.num_virt_counters	= NUM_VIRT_COUNTERS,
 	.reserved		= MSR_AMD_EVENTSEL_RESERVED,
 	.event_mask		= OP_EVENT_MASK,
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 2bf90fafa7b5..f8e268e8e992 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -35,13 +35,15 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs)
 	int i;
 
 	for (i = 0; i < num_counters; i++) {
-		if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
-			msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
-	}
-
-	for (i = 0; i < num_counters; i++) {
-		if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
-			msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
+		if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
+			continue;
+		if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) {
+			release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
+			continue;
+		}
+		/* both registers must be reserved */
+		msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
+		msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
 	}
 }
 
@@ -92,12 +94,10 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
 			op_x86_warn_in_use(i);
 		val &= model->reserved;
 		wrmsrl(msrs->controls[i].addr, val);
-	}
-
-	/* avoid a false detection of ctr overflows in NMI handler */
-	for (i = 0; i < num_counters; ++i) {
-		if (unlikely(!msrs->counters[i].addr))
-			continue;
+		/*
+		 * avoid a false detection of ctr overflows in NMI *
+		 * handler
+		 */
 		wrmsrl(msrs->counters[i].addr, -1LL);
 	}
 
@@ -194,12 +194,10 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
 	int i;
 
 	for (i = 0; i < num_counters; ++i) {
-		if (msrs->counters[i].addr)
-			release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
-	}
-	for (i = 0; i < num_counters; ++i) {
-		if (msrs->controls[i].addr)
-			release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
 	}
 	if (reset_value) {
 		kfree(reset_value);
-- 
cgit v1.2.3-55-g7522


From 83300ce0df6b72e156b386457aa0f0902b8c0a98 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Tue, 23 Mar 2010 20:01:54 +0100
Subject: oprofile/x86: moving shutdown functions

Moving some code in preparation of the next patch.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/op_model_amd.c  | 24 ++++++++++++------------
 arch/x86/oprofile/op_model_p4.c   | 38 ++++++++++++++++++--------------------
 arch/x86/oprofile/op_model_ppro.c | 33 ++++++++++++++++-----------------
 3 files changed, 46 insertions(+), 49 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 2e2bc902b867..7e5886d54bd5 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -126,6 +126,18 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
 
 /* functions for op_amd_spec */
 
+static void op_amd_shutdown(struct op_msrs const * const msrs)
+{
+	int i;
+
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+	}
+}
+
 static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
 {
 	int i;
@@ -422,18 +434,6 @@ static void op_amd_stop(struct op_msrs const * const msrs)
 	op_amd_stop_ibs();
 }
 
-static void op_amd_shutdown(struct op_msrs const * const msrs)
-{
-	int i;
-
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (!msrs->counters[i].addr)
-			continue;
-		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-	}
-}
-
 static u8 ibs_eilvt_off;
 
 static inline void apic_init_ibs_nmi_per_cpu(void *arg)
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index e6a160a4684a..7cc80df330d5 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -385,6 +385,24 @@ static unsigned int get_stagger(void)
 
 static unsigned long reset_value[NUM_COUNTERS_NON_HT];
 
+static void p4_shutdown(struct op_msrs const * const msrs)
+{
+	int i;
+
+	for (i = 0; i < num_counters; ++i) {
+		if (msrs->counters[i].addr)
+			release_perfctr_nmi(msrs->counters[i].addr);
+	}
+	/*
+	 * some of the control registers are specially reserved in
+	 * conjunction with the counter registers (hence the starting offset).
+	 * This saves a few bits.
+	 */
+	for (i = num_counters; i < num_controls; ++i) {
+		if (msrs->controls[i].addr)
+			release_evntsel_nmi(msrs->controls[i].addr);
+	}
+}
 
 static void p4_fill_in_addresses(struct op_msrs * const msrs)
 {
@@ -668,26 +686,6 @@ static void p4_stop(struct op_msrs const * const msrs)
 	}
 }
 
-static void p4_shutdown(struct op_msrs const * const msrs)
-{
-	int i;
-
-	for (i = 0; i < num_counters; ++i) {
-		if (msrs->counters[i].addr)
-			release_perfctr_nmi(msrs->counters[i].addr);
-	}
-	/*
-	 * some of the control registers are specially reserved in
-	 * conjunction with the counter registers (hence the starting offset).
-	 * This saves a few bits.
-	 */
-	for (i = num_counters; i < num_controls; ++i) {
-		if (msrs->controls[i].addr)
-			release_evntsel_nmi(msrs->controls[i].addr);
-	}
-}
-
-
 #ifdef CONFIG_SMP
 struct op_x86_model_spec op_p4_ht2_spec = {
 	.num_counters		= NUM_COUNTERS_HT2,
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index f8e268e8e992..b07d25a52f02 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -30,6 +30,22 @@ static int counter_width = 32;
 
 static u64 *reset_value;
 
+static void ppro_shutdown(struct op_msrs const * const msrs)
+{
+	int i;
+
+	for (i = 0; i < num_counters; ++i) {
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
+	}
+	if (reset_value) {
+		kfree(reset_value);
+		reset_value = NULL;
+	}
+}
+
 static void ppro_fill_in_addresses(struct op_msrs * const msrs)
 {
 	int i;
@@ -189,23 +205,6 @@ static void ppro_stop(struct op_msrs const * const msrs)
 	}
 }
 
-static void ppro_shutdown(struct op_msrs const * const msrs)
-{
-	int i;
-
-	for (i = 0; i < num_counters; ++i) {
-		if (!msrs->counters[i].addr)
-			continue;
-		release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
-		release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
-	}
-	if (reset_value) {
-		kfree(reset_value);
-		reset_value = NULL;
-	}
-}
-
-
 struct op_x86_model_spec op_ppro_spec = {
 	.num_counters		= 2,
 	.num_controls		= 2,
-- 
cgit v1.2.3-55-g7522


From 8617f98c001d00b176422d707e6a67b88bcd7e0d Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Fri, 26 Feb 2010 17:20:55 +0100
Subject: oprofile/x86: return -EBUSY if counters are already reserved

In case a counter is already reserved by the watchdog or perf_event
subsystem, oprofile ignored this counters silently. This case is
handled now and oprofile_setup() now reports an error.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c       |  5 ++++-
 arch/x86/oprofile/op_model_amd.c  | 24 +++++++++++++-----------
 arch/x86/oprofile/op_model_p4.c   | 14 +++++++++++++-
 arch/x86/oprofile/op_model_ppro.c | 24 +++++++++++++-----------
 arch/x86/oprofile/op_x86_model.h  |  2 +-
 5 files changed, 44 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index c0c21f200faf..9f001d904599 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -357,7 +357,10 @@ static int nmi_setup(void)
 	 */
 
 	/* Assume saved/restored counters are the same on all CPUs */
-	model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
+	err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
+	if (err)
+		goto fail;
+
 	for_each_possible_cpu(cpu) {
 		if (!cpu)
 			continue;
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 7e5886d54bd5..536d0b0b39a5 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -138,21 +138,30 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
 	}
 }
 
-static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
+static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
 {
 	int i;
 
 	for (i = 0; i < NUM_COUNTERS; i++) {
 		if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
-			continue;
+			goto fail;
 		if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
 			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-			continue;
+			goto fail;
 		}
 		/* both registers must be reserved */
 		msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
 		msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+		continue;
+	fail:
+		if (!counter_config[i].enabled)
+			continue;
+		op_x86_warn_reserved(i);
+		op_amd_shutdown(msrs);
+		return -EBUSY;
 	}
+
+	return 0;
 }
 
 static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
@@ -172,15 +181,8 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
 
 	/* clear all counters */
 	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (unlikely(!msrs->controls[i].addr)) {
-			if (counter_config[i].enabled && !smp_processor_id())
-				/*
-				 * counter is reserved, this is on all
-				 * cpus, so report only for cpu #0
-				 */
-				op_x86_warn_reserved(i);
+		if (!msrs->controls[i].addr)
 			continue;
-		}
 		rdmsrl(msrs->controls[i].addr, val);
 		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
 			op_x86_warn_in_use(i);
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 7cc80df330d5..182558dd5515 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -404,7 +404,7 @@ static void p4_shutdown(struct op_msrs const * const msrs)
 	}
 }
 
-static void p4_fill_in_addresses(struct op_msrs * const msrs)
+static int p4_fill_in_addresses(struct op_msrs * const msrs)
 {
 	unsigned int i;
 	unsigned int addr, cccraddr, stag;
@@ -486,6 +486,18 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
 			msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
 		}
 	}
+
+	for (i = 0; i < num_counters; ++i) {
+		if (!counter_config[i].enabled)
+			continue;
+		if (msrs->controls[i].addr)
+			continue;
+		op_x86_warn_reserved(i);
+		p4_shutdown(msrs);
+		return -EBUSY;
+	}
+
+	return 0;
 }
 
 
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index b07d25a52f02..1fd17cfb956b 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -46,21 +46,30 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
 	}
 }
 
-static void ppro_fill_in_addresses(struct op_msrs * const msrs)
+static int ppro_fill_in_addresses(struct op_msrs * const msrs)
 {
 	int i;
 
 	for (i = 0; i < num_counters; i++) {
 		if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
-			continue;
+			goto fail;
 		if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) {
 			release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
-			continue;
+			goto fail;
 		}
 		/* both registers must be reserved */
 		msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
 		msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
+		continue;
+	fail:
+		if (!counter_config[i].enabled)
+			continue;
+		op_x86_warn_reserved(i);
+		ppro_shutdown(msrs);
+		return -EBUSY;
 	}
+
+	return 0;
 }
 
 
@@ -96,15 +105,8 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
 
 	/* clear all counters */
 	for (i = 0; i < num_counters; ++i) {
-		if (unlikely(!msrs->controls[i].addr)) {
-			if (counter_config[i].enabled && !smp_processor_id())
-				/*
-				 * counter is reserved, this is on all
-				 * cpus, so report only for cpu #0
-				 */
-				op_x86_warn_reserved(i);
+		if (!msrs->controls[i].addr)
 			continue;
-		}
 		rdmsrl(msrs->controls[i].addr, val);
 		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
 			op_x86_warn_in_use(i);
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index ff82a755edd4..551401398fba 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -41,7 +41,7 @@ struct op_x86_model_spec {
 	u16		event_mask;
 	int		(*init)(struct oprofile_operations *ops);
 	void		(*exit)(void);
-	void		(*fill_in_addresses)(struct op_msrs * const msrs);
+	int		(*fill_in_addresses)(struct op_msrs * const msrs);
 	void		(*setup_ctrs)(struct op_x86_model_spec const *model,
 				      struct op_msrs const * const msrs);
 	int		(*check_ctrs)(struct pt_regs * const regs,
-- 
cgit v1.2.3-55-g7522


From da759fe5be24ec3b236a76c007b460cf6caf2009 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Fri, 26 Feb 2010 10:54:56 +0100
Subject: oprofile/x86: move IBS code

Moving code to make future changes easier. This groups all IBS code
together.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/op_model_amd.c | 220 +++++++++++++++++++--------------------
 1 file changed, 110 insertions(+), 110 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 536d0b0b39a5..e159254fb7cd 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -102,116 +102,6 @@ static u32 get_ibs_caps(void)
 	return ibs_caps;
 }
 
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-
-static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
-			       struct op_msrs const * const msrs)
-{
-	u64 val;
-	int i;
-
-	/* enable active counters */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		int virt = op_x86_phys_to_virt(i);
-		if (!reset_value[virt])
-			continue;
-		rdmsrl(msrs->controls[i].addr, val);
-		val &= model->reserved;
-		val |= op_x86_get_ctrl(model, &counter_config[virt]);
-		wrmsrl(msrs->controls[i].addr, val);
-	}
-}
-
-#endif
-
-/* functions for op_amd_spec */
-
-static void op_amd_shutdown(struct op_msrs const * const msrs)
-{
-	int i;
-
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (!msrs->counters[i].addr)
-			continue;
-		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-	}
-}
-
-static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
-{
-	int i;
-
-	for (i = 0; i < NUM_COUNTERS; i++) {
-		if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
-			goto fail;
-		if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
-			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-			goto fail;
-		}
-		/* both registers must be reserved */
-		msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
-		msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
-		continue;
-	fail:
-		if (!counter_config[i].enabled)
-			continue;
-		op_x86_warn_reserved(i);
-		op_amd_shutdown(msrs);
-		return -EBUSY;
-	}
-
-	return 0;
-}
-
-static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
-			      struct op_msrs const * const msrs)
-{
-	u64 val;
-	int i;
-
-	/* setup reset_value */
-	for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
-		if (counter_config[i].enabled
-		    && msrs->counters[op_x86_virt_to_phys(i)].addr)
-			reset_value[i] = counter_config[i].count;
-		else
-			reset_value[i] = 0;
-	}
-
-	/* clear all counters */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (!msrs->controls[i].addr)
-			continue;
-		rdmsrl(msrs->controls[i].addr, val);
-		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
-			op_x86_warn_in_use(i);
-		val &= model->reserved;
-		wrmsrl(msrs->controls[i].addr, val);
-		/*
-		 * avoid a false detection of ctr overflows in NMI
-		 * handler
-		 */
-		wrmsrl(msrs->counters[i].addr, -1LL);
-	}
-
-	/* enable active counters */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		int virt = op_x86_phys_to_virt(i);
-		if (!reset_value[virt])
-			continue;
-
-		/* setup counter registers */
-		wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
-
-		/* setup control registers */
-		rdmsrl(msrs->controls[i].addr, val);
-		val &= model->reserved;
-		val |= op_x86_get_ctrl(model, &counter_config[virt]);
-		wrmsrl(msrs->controls[i].addr, val);
-	}
-}
-
 /*
  * 16-bit Linear Feedback Shift Register (LFSR)
  *
@@ -376,6 +266,116 @@ static void op_amd_stop_ibs(void)
 		wrmsrl(MSR_AMD64_IBSOPCTL, 0);
 }
 
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+
+static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
+			       struct op_msrs const * const msrs)
+{
+	u64 val;
+	int i;
+
+	/* enable active counters */
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		int virt = op_x86_phys_to_virt(i);
+		if (!reset_value[virt])
+			continue;
+		rdmsrl(msrs->controls[i].addr, val);
+		val &= model->reserved;
+		val |= op_x86_get_ctrl(model, &counter_config[virt]);
+		wrmsrl(msrs->controls[i].addr, val);
+	}
+}
+
+#endif
+
+/* functions for op_amd_spec */
+
+static void op_amd_shutdown(struct op_msrs const * const msrs)
+{
+	int i;
+
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+	}
+}
+
+static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
+{
+	int i;
+
+	for (i = 0; i < NUM_COUNTERS; i++) {
+		if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
+			goto fail;
+		if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
+			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+			goto fail;
+		}
+		/* both registers must be reserved */
+		msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+		msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+		continue;
+	fail:
+		if (!counter_config[i].enabled)
+			continue;
+		op_x86_warn_reserved(i);
+		op_amd_shutdown(msrs);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
+			      struct op_msrs const * const msrs)
+{
+	u64 val;
+	int i;
+
+	/* setup reset_value */
+	for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
+		if (counter_config[i].enabled
+		    && msrs->counters[op_x86_virt_to_phys(i)].addr)
+			reset_value[i] = counter_config[i].count;
+		else
+			reset_value[i] = 0;
+	}
+
+	/* clear all counters */
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		if (!msrs->controls[i].addr)
+			continue;
+		rdmsrl(msrs->controls[i].addr, val);
+		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
+			op_x86_warn_in_use(i);
+		val &= model->reserved;
+		wrmsrl(msrs->controls[i].addr, val);
+		/*
+		 * avoid a false detection of ctr overflows in NMI
+		 * handler
+		 */
+		wrmsrl(msrs->counters[i].addr, -1LL);
+	}
+
+	/* enable active counters */
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		int virt = op_x86_phys_to_virt(i);
+		if (!reset_value[virt])
+			continue;
+
+		/* setup counter registers */
+		wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
+
+		/* setup control registers */
+		rdmsrl(msrs->controls[i].addr, val);
+		val &= model->reserved;
+		val |= op_x86_get_ctrl(model, &counter_config[virt]);
+		wrmsrl(msrs->controls[i].addr, val);
+	}
+}
+
 static int op_amd_check_ctrs(struct pt_regs * const regs,
 			     struct op_msrs const * const msrs)
 {
-- 
cgit v1.2.3-55-g7522


From 5bdb7934ca4115a12c7d585c5a45312b1c36909b Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Wed, 31 Mar 2010 11:58:36 +0200
Subject: oprofile/x86: remove duplicate IBS capability check

The check is already done in ibs_exit().

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/op_model_amd.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index e159254fb7cd..384c52410480 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -490,8 +490,7 @@ static int init_ibs_nmi(void)
 /* uninitialize the APIC for the IBS interrupts if needed */
 static void clear_ibs_nmi(void)
 {
-	if (ibs_caps)
-		on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
+	on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
 }
 
 /* initialize the APIC for the IBS interrupts if available */
-- 
cgit v1.2.3-55-g7522


From 2623a1d55a6260c855e1f6d1895900b50b40a896 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Mon, 3 May 2010 19:44:32 +0200
Subject: oprofile/x86: fix uninitialized counter usage during cpu hotplug

This fixes a NULL pointer dereference that is triggered when taking a
cpu offline after oprofile was initialized, e.g.:

 $ opcontrol --init
 $ opcontrol --start-daemon
 $ opcontrol --shutdown
 $ opcontrol --deinit
 $ echo 0 > /sys/devices/system/cpu/cpu1/online

See the crash dump below. Though the counter has been disabled the cpu
notifier is still active and trying to use already freed counter data.

This fix is for linux-stable. To proper fix this, the hotplug code
must be rewritten. Thus I will leave a WARN_ON_ONCE() message with
this patch.

BUG: unable to handle kernel NULL pointer dereference at (null)
IP: [<ffffffff8132ad57>] op_amd_stop+0x2d/0x8e
PGD 0
Oops: 0000 [#1] SMP
last sysfs file: /sys/devices/system/cpu/cpu1/online
CPU 1
Modules linked in:

Pid: 0, comm: swapper Not tainted 2.6.34-rc5-oprofile-x86_64-standard-00210-g8c00f06 #16 Anaheim/Anaheim
RIP: 0010:[<ffffffff8132ad57>]  [<ffffffff8132ad57>] op_amd_stop+0x2d/0x8e
RSP: 0018:ffff880001843f28  EFLAGS: 00010006
RAX: 0000000000000000 RBX: 0000000000000000 RCX: dead000000200200
RDX: ffff880001843f68 RSI: dead000000100100 RDI: 0000000000000000
RBP: ffff880001843f48 R08: 0000000000000000 R09: ffff880001843f08
R10: ffffffff8102c9a5 R11: ffff88000184ea80 R12: 0000000000000000
R13: ffff88000184f6c0 R14: 0000000000000000 R15: 0000000000000000
FS:  00007fec6a92e6f0(0000) GS:ffff880001840000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000000 CR3: 000000000163b000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffff88042fcd8000, task ffff88042fcd51d0)
Stack:
 ffff880001843f48 0000000000000001 ffff88042e9f7d38 ffff880001843f68
<0> ffff880001843f58 ffffffff8132a602 ffff880001843f98 ffffffff810521b3
<0> ffff880001843f68 ffff880001843f68 ffff880001843f88 ffff88042fcd9fd8
Call Trace:
 <IRQ>
 [<ffffffff8132a602>] nmi_cpu_stop+0x21/0x23
 [<ffffffff810521b3>] generic_smp_call_function_single_interrupt+0xdf/0x11b
 [<ffffffff8101804f>] smp_call_function_single_interrupt+0x22/0x31
 [<ffffffff810029f3>] call_function_single_interrupt+0x13/0x20
 <EOI>
 [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff81008701>] ? default_idle+0x22/0x37
 [<ffffffff8100896d>] c1e_idle+0xdf/0xe6
 [<ffffffff813f1170>] ? atomic_notifier_call_chain+0x13/0x15
 [<ffffffff810012fb>] cpu_idle+0x4b/0x7e
 [<ffffffff813e8a4e>] start_secondary+0x1ae/0x1b2
Code: 89 e5 41 55 49 89 fd 41 54 45 31 e4 53 31 db 48 83 ec 08 89 df e8 be f8 ff ff 48 98 48 83 3c c5 10 67 7a 81 00 74 1f 49 8b 45 08 <42> 8b 0c 20 0f 32 48 c1 e2 20 25 ff ff bf ff 48 09 d0 48 89 c2
RIP  [<ffffffff8132ad57>] op_amd_stop+0x2d/0x8e
 RSP <ffff880001843f28>
CR2: 0000000000000000
---[ end trace 679ac372d674b757 ]---
Kernel panic - not syncing: Fatal exception in interrupt
Pid: 0, comm: swapper Tainted: G      D    2.6.34-rc5-oprofile-x86_64-standard-00210-g8c00f06 #16
Call Trace:
 <IRQ>  [<ffffffff813ebd6a>] panic+0x9e/0x10c
 [<ffffffff810474b0>] ? up+0x34/0x39
 [<ffffffff81031ccc>] ? kmsg_dump+0x112/0x12c
 [<ffffffff813eeff1>] oops_end+0x81/0x8e
 [<ffffffff8101efee>] no_context+0x1f3/0x202
 [<ffffffff8101f1b7>] __bad_area_nosemaphore+0x1ba/0x1e0
 [<ffffffff81028d24>] ? enqueue_task_fair+0x16d/0x17a
 [<ffffffff810264dc>] ? activate_task+0x42/0x53
 [<ffffffff8102c967>] ? try_to_wake_up+0x272/0x284
 [<ffffffff8101f1eb>] bad_area_nosemaphore+0xe/0x10
 [<ffffffff813f0f3f>] do_page_fault+0x1c8/0x37c
 [<ffffffff81028d24>] ? enqueue_task_fair+0x16d/0x17a
 [<ffffffff813ee55f>] page_fault+0x1f/0x30
 [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff8132ad57>] ? op_amd_stop+0x2d/0x8e
 [<ffffffff8132ad46>] ? op_amd_stop+0x1c/0x8e
 [<ffffffff8132a602>] nmi_cpu_stop+0x21/0x23
 [<ffffffff810521b3>] generic_smp_call_function_single_interrupt+0xdf/0x11b
 [<ffffffff8101804f>] smp_call_function_single_interrupt+0x22/0x31
 [<ffffffff810029f3>] call_function_single_interrupt+0x13/0x20
 <EOI>  [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff81008701>] ? default_idle+0x22/0x37
 [<ffffffff8100896d>] c1e_idle+0xdf/0xe6
 [<ffffffff813f1170>] ? atomic_notifier_call_chain+0x13/0x15
 [<ffffffff810012fb>] cpu_idle+0x4b/0x7e
 [<ffffffff813e8a4e>] start_secondary+0x1ae/0x1b2
------------[ cut here ]------------
WARNING: at /local/rrichter/.source/linux/arch/x86/kernel/smp.c:118 native_smp_send_reschedule+0x27/0x53()
Hardware name: Anaheim
Modules linked in:
Pid: 0, comm: swapper Tainted: G      D    2.6.34-rc5-oprofile-x86_64-standard-00210-g8c00f06 #16
Call Trace:
 <IRQ>  [<ffffffff81017f32>] ? native_smp_send_reschedule+0x27/0x53
 [<ffffffff81030ee2>] warn_slowpath_common+0x77/0xa4
 [<ffffffff81030f1e>] warn_slowpath_null+0xf/0x11
 [<ffffffff81017f32>] native_smp_send_reschedule+0x27/0x53
 [<ffffffff8102634b>] resched_task+0x60/0x62
 [<ffffffff8102653a>] check_preempt_curr_idle+0x10/0x12
 [<ffffffff8102c8ea>] try_to_wake_up+0x1f5/0x284
 [<ffffffff8102c986>] default_wake_function+0xd/0xf
 [<ffffffff810a110d>] pollwake+0x57/0x5a
 [<ffffffff8102c979>] ? default_wake_function+0x0/0xf
 [<ffffffff81026be5>] __wake_up_common+0x46/0x75
 [<ffffffff81026ed0>] __wake_up+0x38/0x50
 [<ffffffff81031694>] printk_tick+0x39/0x3b
 [<ffffffff8103ac37>] update_process_times+0x3f/0x5c
 [<ffffffff8104dc63>] tick_periodic+0x5d/0x69
 [<ffffffff8104dc90>] tick_handle_periodic+0x21/0x71
 [<ffffffff81018fd0>] smp_apic_timer_interrupt+0x82/0x95
 [<ffffffff81002853>] apic_timer_interrupt+0x13/0x20
 [<ffffffff81030cb5>] ? panic_blink_one_second+0x0/0x7b
 [<ffffffff813ebdd6>] ? panic+0x10a/0x10c
 [<ffffffff810474b0>] ? up+0x34/0x39
 [<ffffffff81031ccc>] ? kmsg_dump+0x112/0x12c
 [<ffffffff813eeff1>] ? oops_end+0x81/0x8e
 [<ffffffff8101efee>] ? no_context+0x1f3/0x202
 [<ffffffff8101f1b7>] ? __bad_area_nosemaphore+0x1ba/0x1e0
 [<ffffffff81028d24>] ? enqueue_task_fair+0x16d/0x17a
 [<ffffffff810264dc>] ? activate_task+0x42/0x53
 [<ffffffff8102c967>] ? try_to_wake_up+0x272/0x284
 [<ffffffff8101f1eb>] ? bad_area_nosemaphore+0xe/0x10
 [<ffffffff813f0f3f>] ? do_page_fault+0x1c8/0x37c
 [<ffffffff81028d24>] ? enqueue_task_fair+0x16d/0x17a
 [<ffffffff813ee55f>] ? page_fault+0x1f/0x30
 [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff8132ad57>] ? op_amd_stop+0x2d/0x8e
 [<ffffffff8132ad46>] ? op_amd_stop+0x1c/0x8e
 [<ffffffff8132a602>] ? nmi_cpu_stop+0x21/0x23
 [<ffffffff810521b3>] ? generic_smp_call_function_single_interrupt+0xdf/0x11b
 [<ffffffff8101804f>] ? smp_call_function_single_interrupt+0x22/0x31
 [<ffffffff810029f3>] ? call_function_single_interrupt+0x13/0x20
 <EOI>  [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff81008701>] ? default_idle+0x22/0x37
 [<ffffffff8100896d>] ? c1e_idle+0xdf/0xe6
 [<ffffffff813f1170>] ? atomic_notifier_call_chain+0x13/0x15
 [<ffffffff810012fb>] ? cpu_idle+0x4b/0x7e
 [<ffffffff813e8a4e>] ? start_secondary+0x1ae/0x1b2
---[ end trace 679ac372d674b758 ]---

Cc: Andi Kleen <andi@firstfloor.org>
Cc: stable <stable@kernel.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 9f001d904599..24582040b718 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -95,7 +95,10 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs)
 static void nmi_cpu_start(void *dummy)
 {
 	struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
-	model->start(msrs);
+	if (!msrs->controls)
+		WARN_ON_ONCE(1);
+	else
+		model->start(msrs);
 }
 
 static int nmi_start(void)
@@ -107,7 +110,10 @@ static int nmi_start(void)
 static void nmi_cpu_stop(void *dummy)
 {
 	struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
-	model->stop(msrs);
+	if (!msrs->controls)
+		WARN_ON_ONCE(1);
+	else
+		model->stop(msrs);
 }
 
 static void nmi_stop(void)
-- 
cgit v1.2.3-55-g7522


From 216f3d9b4e5121feea4b13fae9d4c83e8d7e1c8a Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Mon, 3 May 2010 11:58:46 +0200
Subject: oprofile/x86: remove CONFIG_SMP macros

CPU notifier register functions also exist if CONFIG_SMP is
disabled. This change is part of hotplug code rework and also
necessary for later patches.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 24582040b718..c5df8ee76ee4 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -471,7 +471,6 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
 	return 0;
 }
 
-#ifdef CONFIG_SMP
 static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
 				 void *data)
 {
@@ -491,7 +490,6 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
 static struct notifier_block oprofile_cpu_nb = {
 	.notifier_call = oprofile_cpu_notifier
 };
-#endif
 
 #ifdef CONFIG_PM
 
@@ -701,9 +699,8 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 		return -ENODEV;
 	}
 
-#ifdef CONFIG_SMP
 	register_cpu_notifier(&oprofile_cpu_nb);
-#endif
+
 	/* default values, can be overwritten by model */
 	ops->create_files	= nmi_create_files;
 	ops->setup		= nmi_setup;
@@ -732,9 +729,7 @@ void op_nmi_exit(void)
 {
 	if (using_nmi) {
 		exit_sysfs();
-#ifdef CONFIG_SMP
 		unregister_cpu_notifier(&oprofile_cpu_nb);
-#endif
 	}
 	if (model->exit)
 		model->exit();
-- 
cgit v1.2.3-55-g7522


From 6ae56b55bc364bc2f2342f599b46581627ba22da Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Thu, 29 Apr 2010 14:55:55 +0200
Subject: oprofile/x86: protect cpu hotplug sections

This patch reworks oprofile cpu hotplug code as follows:

Introduce ctr_running variable to check, if counters are running or
not. The state must be known for taking a cpu on or offline and when
switching counters during counter multiplexing.

Protect on_each_cpu() sections with get_online_cpus()/put_online_cpu()
functions. This is necessary if notifiers or states are
modified. Within these sections the cpu mask may not change.

Switch only between counters in nmi_cpu_switch(), if counters are
running. Otherwise the switch may restart a counter though they are
disabled.

Add nmi_cpu_setup() and nmi_cpu_shutdown() to cpu hotplug code. The
function must also be called to avoid uninitialzed counter usage.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 52 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index c5df8ee76ee4..b56601eaf29d 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -31,8 +31,9 @@ static struct op_x86_model_spec *model;
 static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
 static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
 
-/* 0 == registered but off, 1 == registered and on */
-static int nmi_enabled = 0;
+/* must be protected with get_online_cpus()/put_online_cpus(): */
+static int nmi_enabled;
+static int ctr_running;
 
 struct op_counter_config counter_config[OP_MAX_COUNTER];
 
@@ -103,7 +104,10 @@ static void nmi_cpu_start(void *dummy)
 
 static int nmi_start(void)
 {
+	get_online_cpus();
 	on_each_cpu(nmi_cpu_start, NULL, 1);
+	ctr_running = 1;
+	put_online_cpus();
 	return 0;
 }
 
@@ -118,7 +122,10 @@ static void nmi_cpu_stop(void *dummy)
 
 static void nmi_stop(void)
 {
+	get_online_cpus();
 	on_each_cpu(nmi_cpu_stop, NULL, 1);
+	ctr_running = 0;
+	put_online_cpus();
 }
 
 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
@@ -258,7 +265,10 @@ static int nmi_switch_event(void)
 	if (nmi_multiplex_on() < 0)
 		return -EINVAL;		/* not necessary */
 
-	on_each_cpu(nmi_cpu_switch, NULL, 1);
+	get_online_cpus();
+	if (ctr_running)
+		on_each_cpu(nmi_cpu_switch, NULL, 1);
+	put_online_cpus();
 
 	return 0;
 }
@@ -386,8 +396,11 @@ static int nmi_setup(void)
 	if (err)
 		goto fail;
 
+	get_online_cpus();
 	on_each_cpu(nmi_cpu_setup, NULL, 1);
 	nmi_enabled = 1;
+	put_online_cpus();
+
 	return 0;
 fail:
 	free_msrs();
@@ -433,8 +446,11 @@ static void nmi_shutdown(void)
 {
 	struct op_msrs *msrs;
 
-	nmi_enabled = 0;
+	get_online_cpus();
 	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
+	nmi_enabled = 0;
+	ctr_running = 0;
+	put_online_cpus();
 	unregister_die_notifier(&profile_exceptions_nb);
 	msrs = &get_cpu_var(cpu_msrs);
 	model->shutdown(msrs);
@@ -442,6 +458,22 @@ static void nmi_shutdown(void)
 	put_cpu_var(cpu_msrs);
 }
 
+static void nmi_cpu_up(void *dummy)
+{
+	if (nmi_enabled)
+		nmi_cpu_setup(dummy);
+	if (ctr_running)
+		nmi_cpu_start(dummy);
+}
+
+static void nmi_cpu_down(void *dummy)
+{
+	if (ctr_running)
+		nmi_cpu_stop(dummy);
+	if (nmi_enabled)
+		nmi_cpu_shutdown(dummy);
+}
+
 static int nmi_create_files(struct super_block *sb, struct dentry *root)
 {
 	unsigned int i;
@@ -478,10 +510,10 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
 	switch (action) {
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
-		smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
+		smp_call_function_single(cpu, nmi_cpu_up, NULL, 0);
 		break;
 	case CPU_DOWN_PREPARE:
-		smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
+		smp_call_function_single(cpu, nmi_cpu_down, NULL, 1);
 		break;
 	}
 	return NOTIFY_DONE;
@@ -699,7 +731,11 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 		return -ENODEV;
 	}
 
+	get_online_cpus();
 	register_cpu_notifier(&oprofile_cpu_nb);
+	nmi_enabled = 0;
+	ctr_running = 0;
+	put_online_cpus();
 
 	/* default values, can be overwritten by model */
 	ops->create_files	= nmi_create_files;
@@ -729,7 +765,11 @@ void op_nmi_exit(void)
 {
 	if (using_nmi) {
 		exit_sysfs();
+		get_online_cpus();
 		unregister_cpu_notifier(&oprofile_cpu_nb);
+		nmi_enabled = 0;
+		ctr_running = 0;
+		put_online_cpus();
 	}
 	if (model->exit)
 		model->exit();
-- 
cgit v1.2.3-55-g7522


From de654649737696ecf32873c341b305e30f3dc777 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Mon, 3 May 2010 14:41:22 +0200
Subject: oprofile/x86: stop disabled counters in nmi handler

This patch adds checks to the nmi handler. Now samples are only
generated and counters reenabled, if the counters are running.
Otherwise the counters are stopped, if oprofile is using the nmi. In
other cases it will ignore the nmi notification.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index b56601eaf29d..94b5481bb6c6 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -62,12 +62,16 @@ static int profile_exceptions_notify(struct notifier_block *self,
 {
 	struct die_args *args = (struct die_args *)data;
 	int ret = NOTIFY_DONE;
-	int cpu = smp_processor_id();
 
 	switch (val) {
 	case DIE_NMI:
 	case DIE_NMI_IPI:
-		model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
+		if (ctr_running)
+			model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
+		else if (!nmi_enabled)
+			break;
+		else
+			model->stop(&__get_cpu_var(cpu_msrs));
 		ret = NOTIFY_STOP;
 		break;
 	default:
@@ -392,6 +396,9 @@ static int nmi_setup(void)
 		mux_clone(cpu);
 	}
 
+	nmi_enabled = 0;
+	ctr_running = 0;
+	barrier();
 	err = register_die_notifier(&profile_exceptions_nb);
 	if (err)
 		goto fail;
@@ -451,6 +458,7 @@ static void nmi_shutdown(void)
 	nmi_enabled = 0;
 	ctr_running = 0;
 	put_online_cpus();
+	barrier();
 	unregister_die_notifier(&profile_exceptions_nb);
 	msrs = &get_cpu_var(cpu_msrs);
 	model->shutdown(msrs);
-- 
cgit v1.2.3-55-g7522


From d30d64c6da3ec7a0708bfffa7e05752d5b9a1093 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Mon, 3 May 2010 15:52:26 +0200
Subject: oprofile/x86: reordering some functions

Reordering some functions. Necessary for the next patch. No functional
changes.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 134 ++++++++++++++++++++++----------------------
 1 file changed, 67 insertions(+), 67 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 94b5481bb6c6..7de0572b0a5e 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -364,56 +364,6 @@ static struct notifier_block profile_exceptions_nb = {
 	.priority = 2
 };
 
-static int nmi_setup(void)
-{
-	int err = 0;
-	int cpu;
-
-	if (!allocate_msrs())
-		return -ENOMEM;
-
-	/* We need to serialize save and setup for HT because the subset
-	 * of msrs are distinct for save and setup operations
-	 */
-
-	/* Assume saved/restored counters are the same on all CPUs */
-	err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
-	if (err)
-		goto fail;
-
-	for_each_possible_cpu(cpu) {
-		if (!cpu)
-			continue;
-
-		memcpy(per_cpu(cpu_msrs, cpu).counters,
-		       per_cpu(cpu_msrs, 0).counters,
-		       sizeof(struct op_msr) * model->num_counters);
-
-		memcpy(per_cpu(cpu_msrs, cpu).controls,
-		       per_cpu(cpu_msrs, 0).controls,
-		       sizeof(struct op_msr) * model->num_controls);
-
-		mux_clone(cpu);
-	}
-
-	nmi_enabled = 0;
-	ctr_running = 0;
-	barrier();
-	err = register_die_notifier(&profile_exceptions_nb);
-	if (err)
-		goto fail;
-
-	get_online_cpus();
-	on_each_cpu(nmi_cpu_setup, NULL, 1);
-	nmi_enabled = 1;
-	put_online_cpus();
-
-	return 0;
-fail:
-	free_msrs();
-	return err;
-}
-
 static void nmi_cpu_restore_registers(struct op_msrs *msrs)
 {
 	struct op_msr *counters = msrs->counters;
@@ -449,23 +399,6 @@ static void nmi_cpu_shutdown(void *dummy)
 	nmi_cpu_restore_registers(msrs);
 }
 
-static void nmi_shutdown(void)
-{
-	struct op_msrs *msrs;
-
-	get_online_cpus();
-	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
-	nmi_enabled = 0;
-	ctr_running = 0;
-	put_online_cpus();
-	barrier();
-	unregister_die_notifier(&profile_exceptions_nb);
-	msrs = &get_cpu_var(cpu_msrs);
-	model->shutdown(msrs);
-	free_msrs();
-	put_cpu_var(cpu_msrs);
-}
-
 static void nmi_cpu_up(void *dummy)
 {
 	if (nmi_enabled)
@@ -531,6 +464,73 @@ static struct notifier_block oprofile_cpu_nb = {
 	.notifier_call = oprofile_cpu_notifier
 };
 
+static int nmi_setup(void)
+{
+	int err = 0;
+	int cpu;
+
+	if (!allocate_msrs())
+		return -ENOMEM;
+
+	/* We need to serialize save and setup for HT because the subset
+	 * of msrs are distinct for save and setup operations
+	 */
+
+	/* Assume saved/restored counters are the same on all CPUs */
+	err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
+	if (err)
+		goto fail;
+
+	for_each_possible_cpu(cpu) {
+		if (!cpu)
+			continue;
+
+		memcpy(per_cpu(cpu_msrs, cpu).counters,
+		       per_cpu(cpu_msrs, 0).counters,
+		       sizeof(struct op_msr) * model->num_counters);
+
+		memcpy(per_cpu(cpu_msrs, cpu).controls,
+		       per_cpu(cpu_msrs, 0).controls,
+		       sizeof(struct op_msr) * model->num_controls);
+
+		mux_clone(cpu);
+	}
+
+	nmi_enabled = 0;
+	ctr_running = 0;
+	barrier();
+	err = register_die_notifier(&profile_exceptions_nb);
+	if (err)
+		goto fail;
+
+	get_online_cpus();
+	on_each_cpu(nmi_cpu_setup, NULL, 1);
+	nmi_enabled = 1;
+	put_online_cpus();
+
+	return 0;
+fail:
+	free_msrs();
+	return err;
+}
+
+static void nmi_shutdown(void)
+{
+	struct op_msrs *msrs;
+
+	get_online_cpus();
+	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
+	nmi_enabled = 0;
+	ctr_running = 0;
+	put_online_cpus();
+	barrier();
+	unregister_die_notifier(&profile_exceptions_nb);
+	msrs = &get_cpu_var(cpu_msrs);
+	model->shutdown(msrs);
+	free_msrs();
+	put_cpu_var(cpu_msrs);
+}
+
 #ifdef CONFIG_PM
 
 static int nmi_suspend(struct sys_device *dev, pm_message_t state)
-- 
cgit v1.2.3-55-g7522


From 2c2df8418ac7908eec4558407b83f16739006c54 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:02 -0700
Subject: x86, acpi/irq: Introduce apci_isa_irq_to_gsi

There are a number of cases where the current code makes the assumption
that isa irqs identity map to the first 16 acpi global system intereupts.
In most instances that assumption is correct as that is the required
behaviour in dual i8259 mode and the default behavior in ioapic mode.

However there are some systems out there that take advantage of acpis
interrupt remapping  for the isa irqs to have a completely different
mapping of isa_irq to gsi.

Introduce acpi_isa_irq_to_gsi to perform this mapping explicitly in the
code that needs it.  Initially this will be just the current assumed
identity mapping to ensure it's introduction does not cause regressions.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-1-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/ia64/kernel/acpi.c     | 8 ++++++++
 arch/x86/kernel/acpi/boot.c | 8 ++++++++
 include/linux/acpi.h        | 1 +
 3 files changed, 17 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 4d1a7e9314cf..c6c90f39f4d9 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -785,6 +785,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 	return 0;
 }
 
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+	if (isa_irq >= 16)
+		return -1;
+	*gsi = isa_irq;
+	return 0;
+}
+
 /*
  *  ACPI based hotplug CPU support
  */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cd40aba6aa95..da718d672596 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -458,6 +458,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 	return 0;
 }
 
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+	if (isa_irq >= 16)
+		return -1;
+	*gsi = isa_irq;
+	return 0;
+}
+
 /*
  * success: return IRQ number (>=0)
  * failure: return < 0
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index b926afe8c03e..7a937dabcc4a 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -116,6 +116,7 @@ extern unsigned long acpi_realmode_flags;
 
 int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity);
 int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
+int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi);
 
 #ifdef CONFIG_X86_IO_APIC
 extern int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity);
-- 
cgit v1.2.3-55-g7522


From 9a0a91bb56d2915cdb8585717de38376ad20fef9 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:03 -0700
Subject: x86, acpi/irq: Teach acpi_get_override_irq to take a gsi not an
 isa_irq

In perverse acpi implementations the isa irqs are not identity mapped
to the first 16 gsi.  Furthermore at least the extended interrupt
resource capability may return gsi's and not isa irqs.  So since
what we get from acpi is a gsi teach acpi_get_overrride_irq to
operate on a gsi instead of an isa_irq.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-2-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 23 ++++++++++++++---------
 include/linux/acpi.h           |  4 ++--
 2 files changed, 16 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 127b8718abfb..73ec92838d83 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -4082,22 +4082,27 @@ int __init io_apic_get_version(int ioapic)
 	return reg_01.bits.version;
 }
 
-int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
 {
-	int i;
+	int ioapic, pin, idx;
 
 	if (skip_ioapic_setup)
 		return -1;
 
-	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].irqtype == mp_INT &&
-		    mp_irqs[i].srcbusirq == bus_irq)
-			break;
-	if (i >= mp_irq_entries)
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return -1;
+
+	pin = mp_find_ioapic_pin(ioapic, gsi);
+	if (pin < 0)
+		return -1;
+
+	idx = find_irq_entry(ioapic, pin, mp_INT);
+	if (idx < 0)
 		return -1;
 
-	*trigger = irq_trigger(i);
-	*polarity = irq_polarity(i);
+	*trigger = irq_trigger(idx);
+	*polarity = irq_polarity(idx);
 	return 0;
 }
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 7a937dabcc4a..3da73f5f0ae9 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -119,9 +119,9 @@ int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
 int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi);
 
 #ifdef CONFIG_X86_IO_APIC
-extern int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity);
+extern int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
 #else
-#define acpi_get_override_irq(bus, trigger, polarity) (-1)
+#define acpi_get_override_irq(gsi, trigger, polarity) (-1)
 #endif
 /*
  * This function undoes the effect of one call to acpi_register_gsi().
-- 
cgit v1.2.3-55-g7522


From 9d2062b879495649bb525cf7979126da2e45d288 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:05 -0700
Subject: x86, acpi/irq: Fix acpi_sci_ioapic_setup so it has both bus_irq and
 gsi

Currently acpi_sci_ioapic_setup calls mp_override_legacy_irq with
bus_irq == gsi, which is wrong if we are comming from an override
Instead pass the bus_irq into acpi_sci_ioapic_setup.

This fix was inspired by a similar fix from:
Yinghai Lu <yinghai@kernel.org>

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-4-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index da718d672596..0a036dc6f9ff 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -313,7 +313,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 /*
  * Parse Interrupt Source Override for the ACPI SCI
  */
-static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
+static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
 {
 	if (trigger == 0)	/* compatible SCI trigger is level */
 		trigger = 3;
@@ -333,7 +333,7 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
 	 * If GSI is < 16, this will update its flags,
 	 * else it will create a new mp_irqs[] entry.
 	 */
-	mp_override_legacy_irq(gsi, polarity, trigger, gsi);
+	mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
 
 	/*
 	 * stash over-ride to indicate we've been here
@@ -357,9 +357,10 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
 	acpi_table_print_madt_entry(header);
 
 	if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
-		acpi_sci_ioapic_setup(intsrc->global_irq,
+		acpi_sci_ioapic_setup(intsrc->source_irq,
 				      intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
-				      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2);
+				      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+				      intsrc->global_irq);
 		return 0;
 	}
 
@@ -1162,7 +1163,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	 * pretend we got one so we can set the SCI flags.
 	 */
 	if (!acpi_sci_override_gsi)
-		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
+		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
+				      acpi_gbl_FADT.sci_interrupt);
 
 	/* Fill in identity legacy mappings where no override */
 	mp_config_acpi_legacy_irqs();
-- 
cgit v1.2.3-55-g7522


From 0fd52670fb6400be0996ac492b5ed77f3d83d69a Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:06 -0700
Subject: x86, acpi/irq: Generalize mp_config_acpi_legacy_irqs

Remove the assumption that there is not an override for isa irq 0.
Instead lookup the gsi and from that lookup the ioapic and pin of each
isa irq indivdually.

In general this should not have any behavioural affect but in
perverse cases this gets all of the details correct, instead of
doing something weird.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-5-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 0a036dc6f9ff..3ee92f28a4b2 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -961,8 +961,6 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 void __init mp_config_acpi_legacy_irqs(void)
 {
 	int i;
-	int ioapic;
-	unsigned int dstapic;
 	struct mpc_intsrc mp_irq;
 
 #if defined (CONFIG_MCA) || defined (CONFIG_EISA)
@@ -982,20 +980,28 @@ void __init mp_config_acpi_legacy_irqs(void)
 		return;
 #endif
 
-	/*
-	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
-	 */
-	ioapic = mp_find_ioapic(0);
-	if (ioapic < 0)
-		return;
-	dstapic = mp_ioapics[ioapic].apicid;
-
 	/*
 	 * Use the default configuration for the IRQs 0-15.  Unless
 	 * overridden by (MADT) interrupt source override entries.
 	 */
 	for (i = 0; i < 16; i++) {
+		int ioapic, pin;
+		unsigned int dstapic;
 		int idx;
+		u32 gsi;
+
+		/* Locate the gsi that irq i maps to. */
+		if (acpi_isa_irq_to_gsi(i, &gsi))
+			continue;
+
+		/*
+		 * Locate the IOAPIC that manages the ISA IRQ.
+		 */
+		ioapic = mp_find_ioapic(gsi);
+		if (ioapic < 0)
+			continue;
+		pin = mp_find_ioapic_pin(ioapic, gsi);
+		dstapic = mp_ioapics[ioapic].apicid;
 
 		for (idx = 0; idx < mp_irq_entries; idx++) {
 			struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1005,7 +1011,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 				break;
 
 			/* Do we already have a mapping for this IOAPIC pin */
-			if (irq->dstapic == dstapic && irq->dstirq == i)
+			if (irq->dstapic == dstapic && irq->dstirq == pin)
 				break;
 		}
 
@@ -1020,7 +1026,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 		mp_irq.dstapic = dstapic;
 		mp_irq.irqtype = mp_INT;
 		mp_irq.srcbusirq = i; /* Identity mapped */
-		mp_irq.dstirq = i;
+		mp_irq.dstirq = pin;
 
 		save_mp_irq(&mp_irq);
 	}
-- 
cgit v1.2.3-55-g7522


From 9638fa521e42c9281c874c6b5a382b1ced4ee496 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:07 -0700
Subject: x86, ioapic: Only export mp_find_ioapic and mp_find_ioapic_pin in
 io_apic.h

Multiple declarations of the same function in different headers
is a pain to maintain.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-6-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mpspec.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index d8bf23a88d05..29994f06c7e2 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -106,10 +106,6 @@ struct device;
 extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
 				 int active_high_low);
 extern int acpi_probe_gsi(void);
-#ifdef CONFIG_X86_IO_APIC
-extern int mp_find_ioapic(int gsi);
-extern int mp_find_ioapic_pin(int ioapic, int gsi);
-#endif
 #else /* !CONFIG_ACPI: */
 static inline int acpi_probe_gsi(void)
 {
-- 
cgit v1.2.3-55-g7522


From 4b6b19a1c7302477653d799a53d48063dd53d555 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:08 -0700
Subject: x86, ioapic: Fix io_apic_redir_entries to return the number of
 entries.

io_apic_redir_entries has a huge conceptual bug.  It returns the maximum
redirection entry not the number of redirection entries.  Which simply
does not match what the name of the function.  This just caught me
and it caught  Feng Tang, and  Len Brown when they wrote sfi_parse_ioapic.

Modify io_apic_redir_entries to actually return the number of redirection
entries, and fix the callers so that they properly handle receiving the
number of the number of redirection table entries, instead of the
number of redirection table entries less one.

While the usage in sfi.c does not show up in this patch it is fixed
by virtue of the fact that io_apic_redir_entries now has the semantics
sfi_parse_ioapic most reasonably expects.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-7-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 73ec92838d83..0a053e61b3ea 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3855,7 +3855,11 @@ int __init io_apic_get_redir_entries (int ioapic)
 	reg_01.raw = io_apic_read(ioapic, 1);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
-	return reg_01.bits.entries;
+	/* The register returns the maximum index redir index
+	 * supported, which is one less than the total number of redir
+	 * entries.
+	 */
+	return reg_01.bits.entries + 1;
 }
 
 void __init probe_nr_irqs_gsi(void)
@@ -3871,7 +3875,7 @@ void __init probe_nr_irqs_gsi(void)
 
 		nr = 0;
 		for (idx = 0; idx < nr_ioapics; idx++)
-			nr += io_apic_get_redir_entries(idx) + 1;
+			nr += io_apic_get_redir_entries(idx);
 
 		if (nr > nr_irqs_gsi)
 			nr_irqs_gsi = nr;
@@ -4306,7 +4310,7 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	 */
 	mp_gsi_routing[idx].gsi_base = gsi_base;
 	mp_gsi_routing[idx].gsi_end = gsi_base +
-	    io_apic_get_redir_entries(idx);
+	    io_apic_get_redir_entries(idx) - 1;
 
 	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
 	       "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
-- 
cgit v1.2.3-55-g7522


From eddb0c55a14074d6bac8c2ef169aefd7e2c6f139 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:09 -0700
Subject: x86, ioapic: Fix the types of gsi values

This patches fixes the types of gsi_base and gsi_end values in
struct mp_ioapic_gsi, and the gsi parameter of mp_find_ioapic
and mp_find_ioapic_pin

A gsi is cannonically a u32, not an int.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-8-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/io_apic.h | 10 +++++-----
 arch/x86/kernel/apic/io_apic.c |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 35832a03a515..feeaf0d92460 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -180,12 +180,12 @@ extern void ioapic_write_entry(int apic, int pin,
 extern void setup_ioapic_ids_from_mpc(void);
 
 struct mp_ioapic_gsi{
-	int gsi_base;
-	int gsi_end;
+	u32 gsi_base;
+	u32 gsi_end;
 };
 extern struct mp_ioapic_gsi  mp_gsi_routing[];
-int mp_find_ioapic(int gsi);
-int mp_find_ioapic_pin(int ioapic, int gsi);
+int mp_find_ioapic(u32 gsi);
+int mp_find_ioapic_pin(int ioapic, u32 gsi);
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
 extern void __init pre_init_apic_IRQ0(void);
 
@@ -197,7 +197,7 @@ static const int timer_through_8259 = 0;
 static inline void ioapic_init_mappings(void)	{ }
 static inline void ioapic_insert_resources(void) { }
 static inline void probe_nr_irqs_gsi(void)	{ }
-static inline int mp_find_ioapic(int gsi) { return 0; }
+static inline int mp_find_ioapic(u32 gsi) { return 0; }
 
 struct io_apic_irq_attr;
 static inline int io_apic_set_pci_routing(struct device *dev, int irq,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 0a053e61b3ea..9ab97622b8e6 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -4247,7 +4247,7 @@ void __init ioapic_insert_resources(void)
 	}
 }
 
-int mp_find_ioapic(int gsi)
+int mp_find_ioapic(u32 gsi)
 {
 	int i = 0;
 
@@ -4262,7 +4262,7 @@ int mp_find_ioapic(int gsi)
 	return -1;
 }
 
-int mp_find_ioapic_pin(int ioapic, int gsi)
+int mp_find_ioapic_pin(int ioapic, u32 gsi)
 {
 	if (WARN_ON(ioapic == -1))
 		return -1;
-- 
cgit v1.2.3-55-g7522


From 5777372af5c929b8f3c706ed7b295b7279537c88 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:10 -0700
Subject: x86, ioapic: Teach mp_register_ioapic to compute a global gsi_end

Add the global variable gsi_end and teach mp_register_ioapic
to keep it uptodate as we add more ioapics into the system.

ioapics can only be added early in boot so the code that
runs later can treat gsi_end as a constant.

Remove the have hacks in sfi.c to second guess mp_register_ioapic
by keeping t's own running total of how many gsi's have been seen,
and instead use the gsi_end.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-9-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/io_apic.h | 1 +
 arch/x86/kernel/apic/io_apic.c | 6 ++++++
 arch/x86/kernel/sfi.c          | 4 +---
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index feeaf0d92460..37b0f2bb5034 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -184,6 +184,7 @@ struct mp_ioapic_gsi{
 	u32 gsi_end;
 };
 extern struct mp_ioapic_gsi  mp_gsi_routing[];
+extern u32 gsi_end;
 int mp_find_ioapic(u32 gsi);
 int mp_find_ioapic_pin(int ioapic, u32 gsi);
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9ab97622b8e6..f80725571577 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -89,6 +89,9 @@ int nr_ioapics;
 /* IO APIC gsi routing info */
 struct mp_ioapic_gsi  mp_gsi_routing[MAX_IO_APICS];
 
+/* The last gsi number used */
+u32 gsi_end;
+
 /* MP IRQ source entries */
 struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
@@ -4312,6 +4315,9 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	mp_gsi_routing[idx].gsi_end = gsi_base +
 	    io_apic_get_redir_entries(idx) - 1;
 
+	if (mp_gsi_routing[idx].gsi_end > gsi_end)
+		gsi_end = mp_gsi_routing[idx].gsi_end;
+
 	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
 	       "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
 	       mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
index 34e099382651..7ded57896c0a 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -81,7 +81,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
 #endif /* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
-static u32 gsi_base;
 
 static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 {
@@ -94,8 +93,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 	pentry = (struct sfi_apic_table_entry *)sb->pentry;
 
 	for (i = 0; i < num; i++) {
-		mp_register_ioapic(i, pentry->phys_addr, gsi_base);
-		gsi_base += io_apic_get_redir_entries(i);
+		mp_register_ioapic(i, pentry->phys_addr, gsi_end + 1);
 		pentry++;
 	}
 
-- 
cgit v1.2.3-55-g7522


From cf7500c0ea133d66f8449d86392d83f840102632 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:11 -0700
Subject: x86, ioapic: In mpparse use mp_register_ioapic

Long ago MP_ioapic_info was the primary way of setting up our
ioapic data structures and mp_register_ioapic was a compatibility
shim for acpi code.  Now the situation is reversed and
and mp_register_ioapic is the primary way of setting up our
ioapic data structures.

Keep the setting up of ioapic data structures uniform by
having mp_register_ioapic call mp_register_ioapic.

This changes a few fields:

- type: is now hardset to MP_IOAPIC but type had to
  bey MP_IOAPIC or MP_ioapic_info would not have been called.

- flags: is now hard coded to MPC_APIC_USABLE.
  We require flags to contain at least MPC_APIC_USEBLE in
  MP_ioapic_info and we don't ever examine flags so dropping
  a few flags that might possibly exist that we have never
  used is harmless.

- apicaddr: Unchanged

- apicver: Read from the ioapic instead of using the cached
  hardware value in the MP table.  The real hardware value
  will be more accurate.

- apicid: Now verified to be unique and changed if it is not.
  If the BIOS got this right this is a noop.  If the BIOS did
  not fixing things appears to be the better solution.

This adds gsi_base and gsi_end values to our ioapics defined with
the mpatable, which will make our lives simpler later since
we can always assume gsi_base and gsi_end are valid.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-10-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/mpparse.c | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e81030f71a8f..5ae5d2426edf 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -115,21 +115,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
 		printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
 }
 
-static int bad_ioapic(unsigned long address)
-{
-	if (nr_ioapics >= MAX_IO_APICS) {
-		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
-		       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
-		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
-	}
-	if (!address) {
-		printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
-		       " found in table, skipping!\n");
-		return 1;
-	}
-	return 0;
-}
-
 static void __init MP_ioapic_info(struct mpc_ioapic *m)
 {
 	if (!(m->flags & MPC_APIC_USABLE))
@@ -138,15 +123,7 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
 	printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
 	       m->apicid, m->apicver, m->apicaddr);
 
-	if (bad_ioapic(m->apicaddr))
-		return;
-
-	mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
-	mp_ioapics[nr_ioapics].apicid = m->apicid;
-	mp_ioapics[nr_ioapics].type = m->type;
-	mp_ioapics[nr_ioapics].apicver = m->apicver;
-	mp_ioapics[nr_ioapics].flags = m->flags;
-	nr_ioapics++;
+	mp_register_ioapic(m->apicid, m->apicaddr, gsi_end + 1);
 }
 
 static void print_MP_intsrc_info(struct mpc_intsrc *m)
-- 
cgit v1.2.3-55-g7522


From 7716a5c4ff5f1f3dc5e9edcab125cbf7fceef0af Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:12 -0700
Subject: x86, ioapic: Move nr_ioapic_registers calculation to
 mp_register_ioapic.

Now that all ioapic registration happens in mp_register_ioapic we can
move the calculation of nr_ioapic_registers there from enable_IO_APIC.
The number of ioapic registers is already calucated in mp_register_ioapic
so all that really needs to be done is to save the caluclated value
in nr_ioapic_registers.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-11-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index f80725571577..dae9240bd287 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1953,20 +1953,8 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
 void __init enable_IO_APIC(void)
 {
-	union IO_APIC_reg_01 reg_01;
 	int i8259_apic, i8259_pin;
 	int apic;
-	unsigned long flags;
-
-	/*
-	 * The number of IO-APIC IRQ registers (== #pins):
-	 */
-	for (apic = 0; apic < nr_ioapics; apic++) {
-		raw_spin_lock_irqsave(&ioapic_lock, flags);
-		reg_01.raw = io_apic_read(apic, 1);
-		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-		nr_ioapic_registers[apic] = reg_01.bits.entries+1;
-	}
 
 	if (!legacy_pic->nr_legacy_irqs)
 		return;
@@ -4293,6 +4281,7 @@ static int bad_ioapic(unsigned long address)
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 {
 	int idx = 0;
+	int entries;
 
 	if (bad_ioapic(address))
 		return;
@@ -4311,9 +4300,14 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
 	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
 	 */
+	entries = io_apic_get_redir_entries(idx);
 	mp_gsi_routing[idx].gsi_base = gsi_base;
-	mp_gsi_routing[idx].gsi_end = gsi_base +
-	    io_apic_get_redir_entries(idx) - 1;
+	mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
+
+	/*
+	 * The number of IO-APIC IRQ registers (== #pins):
+	 */
+	nr_ioapic_registers[idx] = entries;
 
 	if (mp_gsi_routing[idx].gsi_end > gsi_end)
 		gsi_end = mp_gsi_routing[idx].gsi_end;
-- 
cgit v1.2.3-55-g7522


From d464207c4fdd70c2a0febd4f9c58206fa915bb36 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:13 -0700
Subject: x86, ioapic: Optimize pin_2_irq

Now that all ioapics have valid gsi_base values use this to
accellerate pin_2_irq.  In the case of acpi this also ensures
that pin_2_irq will compute the same irq value for an ioapic
pin as acpi will.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-12-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dae9240bd287..0d35f46929d1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1019,7 +1019,7 @@ static inline int irq_trigger(int idx)
 int (*ioapic_renumber_irq)(int ioapic, int irq);
 static int pin_2_irq(int idx, int apic, int pin)
 {
-	int irq, i;
+	int irq;
 	int bus = mp_irqs[idx].srcbus;
 
 	/*
@@ -1031,18 +1031,13 @@ static int pin_2_irq(int idx, int apic, int pin)
 	if (test_bit(bus, mp_bus_not_pci)) {
 		irq = mp_irqs[idx].srcbusirq;
 	} else {
-		/*
-		 * PCI IRQs are mapped in order
-		 */
-		i = irq = 0;
-		while (i < apic)
-			irq += nr_ioapic_registers[i++];
-		irq += pin;
+		u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
 		/*
                  * For MPS mode, so far only needed by ES7000 platform
                  */
 		if (ioapic_renumber_irq)
-			irq = ioapic_renumber_irq(apic, irq);
+			gsi = ioapic_renumber_irq(apic, gsi);
+		irq = gsi;
 	}
 
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3-55-g7522


From 4afc51a835d3aeba11c35090f524e05c84586d27 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:14 -0700
Subject: x86, ioapic: Simplify probe_nr_irqs_gsi.

Use the global gsi_end value now that all ioapics have
valid gsi numbers instead of a combination of acpi_probe_gsi
and walking all of the ioapics and couting their number of
entries by hand if acpi_probe_gsi gave us an answer we did
not like.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-13-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mpspec.h  |  6 ------
 arch/x86/kernel/acpi/boot.c    | 23 -----------------------
 arch/x86/kernel/apic/io_apic.c | 17 +++--------------
 3 files changed, 3 insertions(+), 43 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 29994f06c7e2..c82868e9f905 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -105,12 +105,6 @@ extern void mp_config_acpi_legacy_irqs(void);
 struct device;
 extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
 				 int active_high_low);
-extern int acpi_probe_gsi(void);
-#else /* !CONFIG_ACPI: */
-static inline int acpi_probe_gsi(void)
-{
-	return 0;
-}
 #endif /* CONFIG_ACPI */
 
 #define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3ee92f28a4b2..07a63ce5811a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -876,29 +876,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
 extern int es7000_plat;
 #endif
 
-int __init acpi_probe_gsi(void)
-{
-	int idx;
-	int gsi;
-	int max_gsi = 0;
-
-	if (acpi_disabled)
-		return 0;
-
-	if (!acpi_ioapic)
-		return 0;
-
-	max_gsi = 0;
-	for (idx = 0; idx < nr_ioapics; idx++) {
-		gsi = mp_gsi_routing[idx].gsi_end;
-
-		if (gsi > max_gsi)
-			max_gsi = gsi;
-	}
-
-	return max_gsi + 1;
-}
-
 static void assign_to_mp_irq(struct mpc_intsrc *m,
 				    struct mpc_intsrc *mp_irq)
 {
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 0d35f46929d1..9f3f6ca86dac 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3850,22 +3850,11 @@ int __init io_apic_get_redir_entries (int ioapic)
 
 void __init probe_nr_irqs_gsi(void)
 {
-	int nr = 0;
+	int nr;
 
-	nr = acpi_probe_gsi();
-	if (nr > nr_irqs_gsi) {
+	nr = gsi_end + 1;
+	if (nr > nr_irqs_gsi)
 		nr_irqs_gsi = nr;
-	} else {
-		/* for acpi=off or acpi is not compiled in */
-		int idx;
-
-		nr = 0;
-		for (idx = 0; idx < nr_ioapics; idx++)
-			nr += io_apic_get_redir_entries(idx);
-
-		if (nr > nr_irqs_gsi)
-			nr_irqs_gsi = nr;
-	}
 
 	printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
 }
-- 
cgit v1.2.3-55-g7522


From 988856ee1623bd37e384105f7bb2b7fe44c009f6 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:15 -0700
Subject: x86, acpi/irq: Handle isa irqs that are not identity mapped to gsi's.

ACPI irq source overrides are allowed for the 16 isa irqs and are
allowed to map any gsi to any isa irq.  A few motherboards have been
seen to take advantage of this and put the isa irqs on the 2nd or
3rd ioapic.  This causes some problems, most notably the fact
that we can not use any gsi < 16.

To correct this move the gsis that are not isa irqs and have
a gsi number < 16 into the linux irq space just past gsi_end.
This is what the es7000 platform is doing today.  Moving only the
low 16 gsis above the rest of the gsi's only penalizes weird
platforms, leaving sane acpi implementations with a 1-1 mapping
of gsis and irqs.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-14-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c    | 57 +++++++++++++++++++++++++++++++++++++++---
 arch/x86/kernel/apic/io_apic.c |  8 ++++--
 2 files changed, 59 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 07a63ce5811a..325fbbab7f89 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -93,6 +93,53 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
 enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
 
 
+/*
+ * ISA irqs by default are the first 16 gsis but can be
+ * any gsi as specified by an interrupt source override.
+ */
+static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+static unsigned int gsi_to_irq(unsigned int gsi)
+{
+	unsigned int irq = gsi + NR_IRQS_LEGACY;
+	unsigned int i;
+
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
+		if (isa_irq_to_gsi[i] == gsi) {
+			return i;
+		}
+	}
+
+	/* Provide an identity mapping of gsi == irq
+	 * except on truly weird platforms that have
+	 * non isa irqs in the first 16 gsis.
+	 */
+	if (gsi >= NR_IRQS_LEGACY)
+		irq = gsi;
+	else
+		irq = gsi_end + 1 + gsi;
+
+	return irq;
+}
+
+static u32 irq_to_gsi(int irq)
+{
+	unsigned int gsi;
+
+	if (irq < NR_IRQS_LEGACY)
+		gsi = isa_irq_to_gsi[irq];
+	else if (irq <= gsi_end)
+		gsi = irq;
+	else if (irq <= (gsi_end + NR_IRQS_LEGACY))
+		gsi = irq - gsi_end;
+	else
+		gsi = 0xffffffff;
+
+	return gsi;
+}
+
 /*
  * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
  * to map the target physical address. The problem is that set_fixmap()
@@ -449,7 +496,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
 
 int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 {
-	*irq = gsi;
+	*irq = gsi_to_irq(gsi);
 
 #ifdef CONFIG_X86_IO_APIC
 	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
@@ -463,7 +510,7 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
 {
 	if (isa_irq >= 16)
 		return -1;
-	*gsi = isa_irq;
+	*gsi = irq_to_gsi(isa_irq);
 	return 0;
 }
 
@@ -491,7 +538,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 		plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
 	}
 #endif
-	irq = plat_gsi;
+	irq = gsi_to_irq(plat_gsi);
 
 	return irq;
 }
@@ -933,6 +980,8 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	mp_irq.dstirq = pin;	/* INTIN# */
 
 	save_mp_irq(&mp_irq);
+
+	isa_irq_to_gsi[bus_irq] = gsi;
 }
 
 void __init mp_config_acpi_legacy_irqs(void)
@@ -1086,7 +1135,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 	set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
 			     trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
 			     polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-	io_apic_set_pci_routing(dev, gsi, &irq_attr);
+	io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
 
 	return gsi;
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9f3f6ca86dac..594827c3c615 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1037,7 +1037,11 @@ static int pin_2_irq(int idx, int apic, int pin)
                  */
 		if (ioapic_renumber_irq)
 			gsi = ioapic_renumber_irq(apic, gsi);
-		irq = gsi;
+
+		if (gsi >= NR_IRQS_LEGACY)
+			irq = gsi;
+		else
+			irq = gsi_end + 1 + gsi;
 	}
 
 #ifdef CONFIG_X86_32
@@ -3852,7 +3856,7 @@ void __init probe_nr_irqs_gsi(void)
 {
 	int nr;
 
-	nr = gsi_end + 1;
+	nr = gsi_end + 1 + NR_IRQS_LEGACY;
 	if (nr > nr_irqs_gsi)
 		nr_irqs_gsi = nr;
 
-- 
cgit v1.2.3-55-g7522


From 7b20bd5fb902088579af4e70f7f802b93181a628 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Tue, 30 Mar 2010 01:07:16 -0700
Subject: x86, irq: Kill io_apic_renumber_irq

Now that the generic irq layer is performing the exact same remapping as
io_apic_renumber_irq we can kill this weird  es7000 specific function.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-15-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/io_apic.h   |  1 -
 arch/x86/kernel/acpi/boot.c      |  5 -----
 arch/x86/kernel/apic/es7000_32.c | 19 -------------------
 arch/x86/kernel/apic/io_apic.c   |  6 ------
 4 files changed, 31 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 37b0f2bb5034..9da192a17f0f 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -159,7 +159,6 @@ struct io_apic_irq_attr;
 extern int io_apic_set_pci_routing(struct device *dev, int irq,
 		 struct io_apic_irq_attr *irq_attr);
 void setup_IO_APIC_irq_extra(u32 gsi);
-extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 extern void ioapic_insert_resources(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 325fbbab7f89..9a5ed58f09dc 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1117,11 +1117,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 
 	ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
 
-#ifdef CONFIG_X86_32
-	if (ioapic_renumber_irq)
-		gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
-
 	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
 		printk(KERN_ERR "Invalid reference to IOAPIC pin "
 		       "%d-%d\n", mp_ioapics[ioapic].apicid,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 03ba1b895f5e..425e53a87feb 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -131,24 +131,6 @@ int					es7000_plat;
 
 static unsigned int			base;
 
-static int
-es7000_rename_gsi(int ioapic, int gsi)
-{
-	if (es7000_plat == ES7000_ZORRO)
-		return gsi;
-
-	if (!base) {
-		int i;
-		for (i = 0; i < nr_ioapics; i++)
-			base += nr_ioapic_registers[i];
-	}
-
-	if (!ioapic && (gsi < 16))
-		gsi += base;
-
-	return gsi;
-}
-
 static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 {
 	unsigned long vect = 0, psaival = 0;
@@ -190,7 +172,6 @@ static void setup_unisys(void)
 		es7000_plat = ES7000_ZORRO;
 	else
 		es7000_plat = ES7000_CLASSIC;
-	ioapic_renumber_irq = es7000_rename_gsi;
 }
 
 /*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 594827c3c615..d174d8866547 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1016,7 +1016,6 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
-int (*ioapic_renumber_irq)(int ioapic, int irq);
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq;
@@ -1032,11 +1031,6 @@ static int pin_2_irq(int idx, int apic, int pin)
 		irq = mp_irqs[idx].srcbusirq;
 	} else {
 		u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
-		/*
-                 * For MPS mode, so far only needed by ES7000 platform
-                 */
-		if (ioapic_renumber_irq)
-			gsi = ioapic_renumber_irq(apic, gsi);
 
 		if (gsi >= NR_IRQS_LEGACY)
 			irq = gsi;
-- 
cgit v1.2.3-55-g7522


From 4f47b4c9f0b711bf84adb8c27774ae80d346b628 Mon Sep 17 00:00:00 2001
From: Eric W. Biederman
Date: Wed, 5 May 2010 13:22:25 -0700
Subject: x86, acpi/irq: Define gsi_end when X86_IO_APIC is undefined

My recent changes introducing a global gsi_end variable
failed to take into account the case of using acpi on a system
not built to support IO_APICs, causing the build to fail.

Define gsi_end to 15 when CONFIG_X86_IO_APIC is not set to avoid
compile errors.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <m1tyqm14la.fsf_-_@fess.ebiederm.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/io_apic.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9da192a17f0f..63cb4096c3dc 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -197,6 +197,7 @@ static const int timer_through_8259 = 0;
 static inline void ioapic_init_mappings(void)	{ }
 static inline void ioapic_insert_resources(void) { }
 static inline void probe_nr_irqs_gsi(void)	{ }
+#define gsi_end (NR_IRQS_LEGACY - 1)
 static inline int mp_find_ioapic(u32 gsi) { return 0; }
 
 struct io_apic_irq_attr;
-- 
cgit v1.2.3-55-g7522


From b0c4d952a158a6a2547672cf4fc9d55e415410de Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Thu, 6 May 2010 02:24:34 -0700
Subject: x86: Fix fake apicid to node mapping for numa emulation

With NUMA emulation, it's possible for a single cpu to be bound
to multiple nodes since more than one may have affinity if
allocated on a physical node that is local to the cpu.

APIC ids must therefore be mapped to the lowest node ids to
maintain generic kernel use of functions such as cpu_to_node()
that determine device affinity.  For example, if a device has
proximity to physical node 1, for instance, and a cpu happens to
be mapped to a higher emulated node id 8, the proximity may not
be correctly determined by comparison in generic code even
though the cpu may be truly local and allocated on physical node 1.

When this happens, the true topology of the machine isn't
accurately represented in the emulated environment; although
this isn't critical to the system's uptime, any generic code
that is NUMA aware benefits from the physical topology being
accurately represented.

This can affect any system that maps multiple APIC ids to a
single node and is booted with numa=fake=N where N is greater
than the number of physical nodes.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <alpine.DEB.2.00.1005060224140.19473@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/srat_64.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 28c68762648f..38512d0c4742 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -461,7 +461,8 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 		 * node, it must now point to the fake node ID.
 		 */
 		for (j = 0; j < MAX_LOCAL_APIC; j++)
-			if (apicid_to_node[j] == nid)
+			if (apicid_to_node[j] == nid &&
+			    fake_apicid_to_node[j] == NUMA_NO_NODE)
 				fake_apicid_to_node[j] = i;
 	}
 	for (i = 0; i < num_nodes; i++)
-- 
cgit v1.2.3-55-g7522


From 3de668ee8d5b1e08da3200f926ff5a28aeb99bc2 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Mon, 3 May 2010 15:00:25 +0200
Subject: oprofile/x86: notify cpus only when daemon is running

This patch moves the cpu notifier registration from nmi_init() to
nmi_setup(). The corresponding unregistration function is now in
nmi_shutdown(). Thus, the hotplug code is only active, if the oprofile
daemon is running.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 7de0572b0a5e..2a086726cad1 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -504,6 +504,7 @@ static int nmi_setup(void)
 		goto fail;
 
 	get_online_cpus();
+	register_cpu_notifier(&oprofile_cpu_nb);
 	on_each_cpu(nmi_cpu_setup, NULL, 1);
 	nmi_enabled = 1;
 	put_online_cpus();
@@ -519,6 +520,7 @@ static void nmi_shutdown(void)
 	struct op_msrs *msrs;
 
 	get_online_cpus();
+	unregister_cpu_notifier(&oprofile_cpu_nb);
 	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
 	nmi_enabled = 0;
 	ctr_running = 0;
@@ -739,12 +741,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 		return -ENODEV;
 	}
 
-	get_online_cpus();
-	register_cpu_notifier(&oprofile_cpu_nb);
-	nmi_enabled = 0;
-	ctr_running = 0;
-	put_online_cpus();
-
 	/* default values, can be overwritten by model */
 	ops->create_files	= nmi_create_files;
 	ops->setup		= nmi_setup;
@@ -771,14 +767,8 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 
 void op_nmi_exit(void)
 {
-	if (using_nmi) {
+	if (using_nmi)
 		exit_sysfs();
-		get_online_cpus();
-		unregister_cpu_notifier(&oprofile_cpu_nb);
-		nmi_enabled = 0;
-		ctr_running = 0;
-		put_online_cpus();
-	}
 	if (model->exit)
 		model->exit();
 }
-- 
cgit v1.2.3-55-g7522


From bae663bc635e2726c7c5228dbf0f2051e16d1c81 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Wed, 5 May 2010 17:47:17 +0200
Subject: oprofile/x86: make AMD IBS hotplug capable

Current IBS code is not hotplug capable. An offline cpu might not be
initialized or deinitialized properly. This patch fixes this by
removing on_each_cpu() functions. The IBS init/deinit code is executed
in the per-cpu functions model->setup_ctrs() and model->cpu_down()
which are also called by hotplug notifiers. model->cpu_down() replaces
model->exit() that became obsolete.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c      |  4 +--
 arch/x86/oprofile/op_model_amd.c | 54 ++++++++++++----------------------------
 arch/x86/oprofile/op_x86_model.h |  2 +-
 3 files changed, 19 insertions(+), 41 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 2a086726cad1..b28d2f1253bb 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -397,6 +397,8 @@ static void nmi_cpu_shutdown(void *dummy)
 	apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
 	apic_write(APIC_LVTERR, v);
 	nmi_cpu_restore_registers(msrs);
+	if (model->cpu_down)
+		model->cpu_down();
 }
 
 static void nmi_cpu_up(void *dummy)
@@ -769,6 +771,4 @@ void op_nmi_exit(void)
 {
 	if (using_nmi)
 		exit_sysfs();
-	if (model->exit)
-		model->exit();
 }
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 384c52410480..b67a6b5aa8d4 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -374,6 +374,15 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
 		val |= op_x86_get_ctrl(model, &counter_config[virt]);
 		wrmsrl(msrs->controls[i].addr, val);
 	}
+
+	if (ibs_caps)
+		setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
+}
+
+static void op_amd_cpu_shutdown(void)
+{
+	if (ibs_caps)
+		setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
 }
 
 static int op_amd_check_ctrs(struct pt_regs * const regs,
@@ -436,28 +445,16 @@ static void op_amd_stop(struct op_msrs const * const msrs)
 	op_amd_stop_ibs();
 }
 
-static u8 ibs_eilvt_off;
-
-static inline void apic_init_ibs_nmi_per_cpu(void *arg)
-{
-	ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
-}
-
-static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
-{
-	setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
-}
-
-static int init_ibs_nmi(void)
+static int __init_ibs_nmi(void)
 {
 #define IBSCTL_LVTOFFSETVAL		(1 << 8)
 #define IBSCTL				0x1cc
 	struct pci_dev *cpu_cfg;
 	int nodes;
 	u32 value = 0;
+	u8 ibs_eilvt_off;
 
-	/* per CPU setup */
-	on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1);
+	ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
 
 	nodes = 0;
 	cpu_cfg = NULL;
@@ -487,21 +484,15 @@ static int init_ibs_nmi(void)
 	return 0;
 }
 
-/* uninitialize the APIC for the IBS interrupts if needed */
-static void clear_ibs_nmi(void)
-{
-	on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
-}
-
 /* initialize the APIC for the IBS interrupts if available */
-static void ibs_init(void)
+static void init_ibs(void)
 {
 	ibs_caps = get_ibs_caps();
 
 	if (!ibs_caps)
 		return;
 
-	if (init_ibs_nmi()) {
+	if (__init_ibs_nmi()) {
 		ibs_caps = 0;
 		return;
 	}
@@ -510,14 +501,6 @@ static void ibs_init(void)
 	       (unsigned)ibs_caps);
 }
 
-static void ibs_exit(void)
-{
-	if (!ibs_caps)
-		return;
-
-	clear_ibs_nmi();
-}
-
 static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
 
 static int setup_ibs_files(struct super_block *sb, struct dentry *root)
@@ -566,17 +549,12 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
 
 static int op_amd_init(struct oprofile_operations *ops)
 {
-	ibs_init();
+	init_ibs();
 	create_arch_files = ops->create_files;
 	ops->create_files = setup_ibs_files;
 	return 0;
 }
 
-static void op_amd_exit(void)
-{
-	ibs_exit();
-}
-
 struct op_x86_model_spec op_amd_spec = {
 	.num_counters		= NUM_COUNTERS,
 	.num_controls		= NUM_COUNTERS,
@@ -584,9 +562,9 @@ struct op_x86_model_spec op_amd_spec = {
 	.reserved		= MSR_AMD_EVENTSEL_RESERVED,
 	.event_mask		= OP_EVENT_MASK,
 	.init			= op_amd_init,
-	.exit			= op_amd_exit,
 	.fill_in_addresses	= &op_amd_fill_in_addresses,
 	.setup_ctrs		= &op_amd_setup_ctrs,
+	.cpu_down		= &op_amd_cpu_shutdown,
 	.check_ctrs		= &op_amd_check_ctrs,
 	.start			= &op_amd_start,
 	.stop			= &op_amd_stop,
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 551401398fba..89017fa1fd63 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -40,10 +40,10 @@ struct op_x86_model_spec {
 	u64		reserved;
 	u16		event_mask;
 	int		(*init)(struct oprofile_operations *ops);
-	void		(*exit)(void);
 	int		(*fill_in_addresses)(struct op_msrs * const msrs);
 	void		(*setup_ctrs)(struct op_x86_model_spec const *model,
 				      struct op_msrs const * const msrs);
+	void		(*cpu_down)(void);
 	int		(*check_ctrs)(struct pt_regs * const regs,
 				      struct op_msrs const * const msrs);
 	void		(*start)(struct op_msrs const * const msrs);
-- 
cgit v1.2.3-55-g7522


From a2a47c6c3d1a7c01da4464b3b7be93b924f874c1 Mon Sep 17 00:00:00 2001
From: Ky Srinivasan
Date: Thu, 6 May 2010 12:08:41 -0700
Subject: x86: Detect running on a Microsoft HyperV system

This patch integrates HyperV detection within the framework currently
used by VmWare. With this patch, we can avoid having to replicate the
HyperV detection code in each of the Microsoft HyperV drivers.

Reworked and tweaked by Greg K-H to build properly.

Signed-off-by: K. Y. Srinivasan <ksrinivasan@novell.com>
LKML-Reference: <20100506190841.GA1605@kroah.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Vadim Rozenfeld <vrozenfe@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/hyperv.h    |  6 ++++
 arch/x86/include/asm/mshyperv.h  |  7 +++++
 arch/x86/include/asm/processor.h |  3 ++
 arch/x86/kernel/cpu/Makefile     |  2 +-
 arch/x86/kernel/cpu/hypervisor.c | 10 ++++--
 arch/x86/kernel/cpu/mshyperv.c   | 67 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 91 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/include/asm/mshyperv.h
 create mode 100644 arch/x86/kernel/cpu/mshyperv.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
index e153a2b3889a..46040473e122 100644
--- a/arch/x86/include/asm/hyperv.h
+++ b/arch/x86/include/asm/hyperv.h
@@ -14,6 +14,9 @@
 #define HYPERV_CPUID_ENLIGHTMENT_INFO		0x40000004
 #define HYPERV_CPUID_IMPLEMENT_LIMITS		0x40000005
 
+#define HYPERV_HYPERVISOR_PRESENT_BIT		0x80000000
+#define HYPERV_CPUID_MIN			0x40000005
+
 /*
  * Feature identification. EAX indicates which features are available
  * to the partition based upon the current partition privileges.
@@ -129,6 +132,9 @@
 /* MSR used to provide vcpu index */
 #define HV_X64_MSR_VP_INDEX			0x40000002
 
+/* MSR used to read the per-partition time reference counter */
+#define HV_X64_MSR_TIME_REF_COUNT		0x40000020
+
 /* Define the virtual APIC registers */
 #define HV_X64_MSR_EOI				0x40000070
 #define HV_X64_MSR_ICR				0x40000071
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
new file mode 100644
index 000000000000..6cd8101d1344
--- /dev/null
+++ b/arch/x86/include/asm/mshyperv.h
@@ -0,0 +1,7 @@
+#ifndef ASM_X86__MSHYPER_H
+#define ASM_X86__MSHYPER_H
+
+int ms_hyperv_platform(void);
+void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c);
+
+#endif
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b753ea59703a..597c041bd124 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -114,6 +114,8 @@ struct cpuinfo_x86 {
 	u16			cpu_index;
 #endif
 	unsigned int		x86_hyper_vendor;
+	/* The layout of this field is hypervisor specific */
+	unsigned int		x86_hyper_features;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL	0
@@ -129,6 +131,7 @@ struct cpuinfo_x86 {
 
 #define X86_HYPER_VENDOR_NONE  0
 #define X86_HYPER_VENDOR_VMWARE 1
+#define X86_HYPER_VENDOR_MSFT	2
 
 /*
  * capabilities of CPUs
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c202b62f3671..3a785da34b6f 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o		:= $(nostackp)
 
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
-obj-y			+= vmware.o hypervisor.o sched.o
+obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o
 
 obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 08be922de33a..de3f4e0ce8eb 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -23,6 +23,7 @@
 
 #include <asm/processor.h>
 #include <asm/vmware.h>
+#include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
 
 static inline void __cpuinit
@@ -30,6 +31,8 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c)
 {
 	if (vmware_platform())
 		c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
+	else if (ms_hyperv_platform())
+		c->x86_hyper_vendor = X86_HYPER_VENDOR_MSFT;
 	else
 		c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
 }
@@ -37,10 +40,11 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c)
 static inline void __cpuinit
 hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
 {
-	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
+	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
 		vmware_set_feature_bits(c);
-		return;
-	}
+	else if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_MSFT)
+		ms_hyperv_set_feature_bits(c);
+	return;
 }
 
 void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 000000000000..2443b61cdb17
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,67 @@
+/*
+ * HyperV  Detection code.
+ *
+ * Copyright (C) 2010, Novell, Inc.
+ * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/types.h>
+#include <asm/processor.h>
+#include <asm/hyperv.h>
+#include <asm/mshyperv.h>
+
+
+int ms_hyperv_platform(void)
+{
+	u32 eax, ebx, ecx, edx;
+	char hyp_signature[13];
+
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+	if (!(ecx & HYPERV_HYPERVISOR_PRESENT_BIT))
+		return 0;
+
+	cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &ebx, &ecx, &edx);
+	*(u32 *)(hyp_signature + 0) = ebx;
+	*(u32 *)(hyp_signature + 4) = ecx;
+	*(u32 *)(hyp_signature + 8) = edx;
+
+	if ((eax < HYPERV_CPUID_MIN) || (memcmp("Microsoft Hv", hyp_signature, 12)))
+		return 0;
+	return 1;
+}
+
+void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c)
+{
+	u32 eax, ebx, ecx, edx;
+
+	c->x86_hyper_features = 0;
+	/*
+	 * Extract the features, recommendations etc.
+	 * The first 9 bits will be used to track hypervisor features.
+	 * The next 6 bits will be used to track the hypervisor
+	 * recommendations.
+	 */
+	cpuid(HYPERV_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
+	c->x86_hyper_features |= (eax & 0x1ff);
+
+	cpuid(HYPERV_CPUID_ENLIGHTMENT_INFO, &eax, &ebx, &ecx, &edx);
+	c->x86_hyper_features |= ((eax & 0x3f) << 9);
+	printk(KERN_INFO "Detected HyperV with features: %x\n",
+		c->x86_hyper_features);
+}
-- 
cgit v1.2.3-55-g7522


From 4261e0e0efd9e04b6c69e0773c3cf4d6f337c416 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Tue, 13 Apr 2010 22:23:10 +0200
Subject: perf, x86: Move perfctr init code to x86_setup_perfctr()

Split __hw_perf_event_init() to configure pmu events other than
perfctrs. Perfctr code is moved to a separate function
x86_setup_perfctr(). This and the following patches refactor the code.

Split in multiple patches for better review.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-2-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 7de70613e6c3..801441a54245 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -426,6 +426,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 	return 0;
 }
 
+static int x86_setup_perfctr(struct perf_event *event);
+
 static int x86_pmu_hw_config(struct perf_event *event)
 {
 	/*
@@ -453,9 +455,6 @@ static int x86_pmu_hw_config(struct perf_event *event)
  */
 static int __hw_perf_event_init(struct perf_event *event)
 {
-	struct perf_event_attr *attr = &event->attr;
-	struct hw_perf_event *hwc = &event->hw;
-	u64 config;
 	int err;
 
 	if (!x86_pmu_initialized())
@@ -482,15 +481,24 @@ static int __hw_perf_event_init(struct perf_event *event)
 
 	event->destroy = hw_perf_event_destroy;
 
-	hwc->idx = -1;
-	hwc->last_cpu = -1;
-	hwc->last_tag = ~0ULL;
+	event->hw.idx = -1;
+	event->hw.last_cpu = -1;
+	event->hw.last_tag = ~0ULL;
 
 	/* Processor specifics */
 	err = x86_pmu.hw_config(event);
 	if (err)
 		return err;
 
+	return x86_setup_perfctr(event);
+}
+
+static int x86_setup_perfctr(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	struct hw_perf_event *hwc = &event->hw;
+	u64 config;
+
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
 		hwc->last_period = hwc->sample_period;
-- 
cgit v1.2.3-55-g7522


From c1726f343b3bfc2ee037e191907c632a31903021 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Tue, 13 Apr 2010 22:23:11 +0200
Subject: perf, x86: Move x86_setup_perfctr()

Move x86_setup_perfctr(), no other changes made.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-3-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 120 +++++++++++++++++++--------------------
 1 file changed, 59 insertions(+), 61 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 801441a54245..3d3bceb9e830 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -426,7 +426,65 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 	return 0;
 }
 
-static int x86_setup_perfctr(struct perf_event *event);
+static int x86_setup_perfctr(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	struct hw_perf_event *hwc = &event->hw;
+	u64 config;
+
+	if (!hwc->sample_period) {
+		hwc->sample_period = x86_pmu.max_period;
+		hwc->last_period = hwc->sample_period;
+		atomic64_set(&hwc->period_left, hwc->sample_period);
+	} else {
+		/*
+		 * If we have a PMU initialized but no APIC
+		 * interrupts, we cannot sample hardware
+		 * events (user-space has to fall back and
+		 * sample via a hrtimer based software event):
+		 */
+		if (!x86_pmu.apic)
+			return -EOPNOTSUPP;
+	}
+
+	if (attr->type == PERF_TYPE_RAW)
+		return 0;
+
+	if (attr->type == PERF_TYPE_HW_CACHE)
+		return set_ext_hw_attr(hwc, attr);
+
+	if (attr->config >= x86_pmu.max_events)
+		return -EINVAL;
+
+	/*
+	 * The generic map:
+	 */
+	config = x86_pmu.event_map(attr->config);
+
+	if (config == 0)
+		return -ENOENT;
+
+	if (config == -1LL)
+		return -EINVAL;
+
+	/*
+	 * Branch tracing:
+	 */
+	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
+	    (hwc->sample_period == 1)) {
+		/* BTS is not supported by this architecture. */
+		if (!x86_pmu.bts)
+			return -EOPNOTSUPP;
+
+		/* BTS is currently only allowed for user-mode. */
+		if (!attr->exclude_kernel)
+			return -EOPNOTSUPP;
+	}
+
+	hwc->config |= config;
+
+	return 0;
+}
 
 static int x86_pmu_hw_config(struct perf_event *event)
 {
@@ -493,66 +551,6 @@ static int __hw_perf_event_init(struct perf_event *event)
 	return x86_setup_perfctr(event);
 }
 
-static int x86_setup_perfctr(struct perf_event *event)
-{
-	struct perf_event_attr *attr = &event->attr;
-	struct hw_perf_event *hwc = &event->hw;
-	u64 config;
-
-	if (!hwc->sample_period) {
-		hwc->sample_period = x86_pmu.max_period;
-		hwc->last_period = hwc->sample_period;
-		atomic64_set(&hwc->period_left, hwc->sample_period);
-	} else {
-		/*
-		 * If we have a PMU initialized but no APIC
-		 * interrupts, we cannot sample hardware
-		 * events (user-space has to fall back and
-		 * sample via a hrtimer based software event):
-		 */
-		if (!x86_pmu.apic)
-			return -EOPNOTSUPP;
-	}
-
-	if (attr->type == PERF_TYPE_RAW)
-		return 0;
-
-	if (attr->type == PERF_TYPE_HW_CACHE)
-		return set_ext_hw_attr(hwc, attr);
-
-	if (attr->config >= x86_pmu.max_events)
-		return -EINVAL;
-
-	/*
-	 * The generic map:
-	 */
-	config = x86_pmu.event_map(attr->config);
-
-	if (config == 0)
-		return -ENOENT;
-
-	if (config == -1LL)
-		return -EINVAL;
-
-	/*
-	 * Branch tracing:
-	 */
-	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
-	    (hwc->sample_period == 1)) {
-		/* BTS is not supported by this architecture. */
-		if (!x86_pmu.bts)
-			return -EOPNOTSUPP;
-
-		/* BTS is currently only allowed for user-mode. */
-		if (!attr->exclude_kernel)
-			return -EOPNOTSUPP;
-	}
-
-	hwc->config |= config;
-
-	return 0;
-}
-
 static void x86_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-- 
cgit v1.2.3-55-g7522


From 9d0fcba67e47ff398a6fa86476d4884d472dc98a Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Tue, 13 Apr 2010 22:23:12 +0200
Subject: perf, x86: Call x86_setup_perfctr() from .hw_config()

The perfctr setup calls are in the corresponding .hw_config()
functions now. This makes it possible to introduce config functions
for other pmu events that are not perfctr specific.

Also, all of a sudden the code looks much nicer.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-4-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c    | 9 ++-------
 arch/x86/kernel/cpu/perf_event_p4.c | 2 +-
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3d3bceb9e830..c2c1e10f7b03 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -505,7 +505,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
 	if (event->attr.type == PERF_TYPE_RAW)
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
-	return 0;
+	return x86_setup_perfctr(event);
 }
 
 /*
@@ -543,12 +543,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	event->hw.last_cpu = -1;
 	event->hw.last_tag = ~0ULL;
 
-	/* Processor specifics */
-	err = x86_pmu.hw_config(event);
-	if (err)
-		return err;
-
-	return x86_setup_perfctr(event);
+	return x86_pmu.hw_config(event);
 }
 
 static void x86_pmu_disable_all(void)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 15367cce66bd..9e002054cb5f 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -455,7 +455,7 @@ static int p4_hw_config(struct perf_event *event)
 		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
 		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
 
-	return 0;
+	return x86_setup_perfctr(event);
 }
 
 static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
-- 
cgit v1.2.3-55-g7522


From 31fa58af57c41d2912debf62d47d5811062411f1 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Tue, 13 Apr 2010 22:23:14 +0200
Subject: perf, x86: Pass enable bit mask to __x86_pmu_enable_event()

To reuse this function for events with different enable bit masks,
this mask is part of the function's argument list now.

The function will be used later to control ibs events too.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-6-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       | 9 +++++----
 arch/x86/kernel/cpu/perf_event_intel.c | 5 +++--
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c2c1e10f7b03..4e218d7ac852 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -844,10 +844,10 @@ void hw_perf_enable(void)
 	x86_pmu.enable_all(added);
 }
 
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+					  u64 enable_mask)
 {
-	wrmsrl(hwc->config_base + hwc->idx,
-			      hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
+	wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
 }
 
 static inline void x86_pmu_disable_event(struct perf_event *event)
@@ -919,7 +919,8 @@ static void x86_pmu_enable_event(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	if (cpuc->enabled)
-		__x86_pmu_enable_event(&event->hw);
+		__x86_pmu_enable_event(&event->hw,
+				       ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a099df96f916..a4b56ac425cb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -513,7 +513,8 @@ static void intel_pmu_nhm_enable_all(int added)
 			if (!event)
 				continue;
 
-			__x86_pmu_enable_event(&event->hw);
+			__x86_pmu_enable_event(&event->hw,
+					       ARCH_PERFMON_EVENTSEL_ENABLE);
 		}
 	}
 	intel_pmu_enable_all(added);
@@ -617,7 +618,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
 	if (unlikely(event->attr.precise))
 		intel_pmu_pebs_enable(event);
 
-	__x86_pmu_enable_event(hwc);
+	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 
 /*
-- 
cgit v1.2.3-55-g7522


From a1f2b70a942b8d858a0ab02567da3999b60a99b2 Mon Sep 17 00:00:00 2001
From: Robert Richter
Date: Tue, 13 Apr 2010 22:23:15 +0200
Subject: perf, x86: Use weight instead of cmask in for_each_event_constraint()

There may exist constraints with a cmask set to zero. In this case
for_each_event_constraint() will not work properly. Now weight is used
instead of the cmask for loop exit detection. Weight is always a value
other than zero since the default contains the HWEIGHT from the
counter mask and in other cases a value of zero does not fit too.

This is in preparation of ibs event constraints that wont have a
cmask.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-7-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4e218d7ac852..4a3f1f2b9b91 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -170,7 +170,7 @@ struct cpu_hw_events {
 	EVENT_CONSTRAINT(0, 0, 0)
 
 #define for_each_event_constraint(e, c)	\
-	for ((e) = (c); (e)->cmask; (e)++)
+	for ((e) = (c); (e)->weight; (e)++)
 
 union perf_capabilities {
 	struct {
-- 
cgit v1.2.3-55-g7522


From 1e9a6d8d44cb6dcd2799b36ceb23007e6a423bfe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 4 May 2010 16:30:21 +0200
Subject: perf, x86: Remove PEBS SAMPLE_RAW support

Its broken, we really should get PERF_SAMPLE_REGS sorted.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ec8b2e12e104..080b9b065bdd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -459,7 +459,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	struct perf_event *event = cpuc->events[0]; /* PMC0 only */
 	struct pebs_record_core *at, *top;
 	struct perf_sample_data data;
-	struct perf_raw_record raw;
 	struct pt_regs regs;
 	int n;
 
@@ -499,12 +498,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 
-	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-		raw.size = x86_pmu.pebs_record_size;
-		raw.data = at;
-		data.raw = &raw;
-	}
-
 	/*
 	 * We use the interrupt regs as a base because the PEBS record
 	 * does not contain a full regs set, specifically it seems to
@@ -536,7 +529,6 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	struct pebs_record_nhm *at, *top;
 	struct perf_sample_data data;
 	struct perf_event *event = NULL;
-	struct perf_raw_record raw;
 	struct pt_regs regs;
 	u64 status = 0;
 	int bit, n;
@@ -585,12 +577,6 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		perf_sample_data_init(&data, 0);
 		data.period = event->hw.last_period;
 
-		if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-			raw.size = x86_pmu.pebs_record_size;
-			raw.data = at;
-			data.raw = &raw;
-		}
-
 		/*
 		 * See the comment in intel_pmu_drain_pebs_core()
 		 */
-- 
cgit v1.2.3-55-g7522


From 2b0b5c6fe9b383f3cf35a0a6371c9d577bd523ff Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 8 Apr 2010 23:03:20 +0200
Subject: perf, x86: Consolidate some code repetition

Remove some duplicated logic.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 97 ++++++++++++++-----------------
 1 file changed, 44 insertions(+), 53 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 080b9b065bdd..35056f715e9e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -452,14 +452,54 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 
 static int intel_pmu_save_and_restart(struct perf_event *event);
 
+static void __intel_pmu_pebs_event(struct perf_event *event,
+				   struct pt_regs *iregs, void *__pebs)
+{
+	/*
+	 * We cast to pebs_record_core since that is a subset of
+	 * both formats and we don't use the other fields in this
+	 * routine.
+	 */
+	struct pebs_record_core *pebs = __pebs;
+	struct perf_sample_data data;
+	struct pt_regs regs;
+
+	if (!intel_pmu_save_and_restart(event))
+		return;
+
+	perf_sample_data_init(&data, 0);
+	data.period = event->hw.last_period;
+
+	/*
+	 * We use the interrupt regs as a base because the PEBS record
+	 * does not contain a full regs set, specifically it seems to
+	 * lack segment descriptors, which get used by things like
+	 * user_mode().
+	 *
+	 * In the simple case fix up only the IP and BP,SP regs, for
+	 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
+	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
+	 */
+	regs = *iregs;
+	regs.ip = pebs->ip;
+	regs.bp = pebs->bp;
+	regs.sp = pebs->sp;
+
+	if (intel_pmu_pebs_fixup_ip(regs))
+		regs.flags |= PERF_EFLAGS_EXACT;
+	else
+		regs.flags &= ~PERF_EFLAGS_EXACT;
+
+	if (perf_event_overflow(event, 1, &data, &regs))
+		x86_pmu_stop(event);
+}
+
 static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
 	struct perf_event *event = cpuc->events[0]; /* PMC0 only */
 	struct pebs_record_core *at, *top;
-	struct perf_sample_data data;
-	struct pt_regs regs;
 	int n;
 
 	if (!ds || !x86_pmu.pebs)
@@ -485,9 +525,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	if (n <= 0)
 		return;
 
-	if (!intel_pmu_save_and_restart(event))
-		return;
-
 	/*
 	 * Should not happen, we program the threshold at 1 and do not
 	 * set a reset value.
@@ -495,31 +532,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	WARN_ON_ONCE(n > 1);
 	at += n - 1;
 
-	perf_sample_data_init(&data, 0);
-	data.period = event->hw.last_period;
-
-	/*
-	 * We use the interrupt regs as a base because the PEBS record
-	 * does not contain a full regs set, specifically it seems to
-	 * lack segment descriptors, which get used by things like
-	 * user_mode().
-	 *
-	 * In the simple case fix up only the IP and BP,SP regs, for
-	 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
-	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
-	 */
-	regs = *iregs;
-	regs.ip = at->ip;
-	regs.bp = at->bp;
-	regs.sp = at->sp;
-
-	if (intel_pmu_pebs_fixup_ip(&regs))
-		regs.flags |= PERF_EFLAGS_EXACT;
-	else
-		regs.flags &= ~PERF_EFLAGS_EXACT;
-
-	if (perf_event_overflow(event, 1, &data, &regs))
-		x86_pmu_stop(event);
+	__intel_pmu_pebs_event(event, iregs, at);
 }
 
 static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
@@ -527,9 +540,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
 	struct pebs_record_nhm *at, *top;
-	struct perf_sample_data data;
 	struct perf_event *event = NULL;
-	struct pt_regs regs;
 	u64 status = 0;
 	int bit, n;
 
@@ -571,27 +582,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		if (!event || bit >= MAX_PEBS_EVENTS)
 			continue;
 
-		if (!intel_pmu_save_and_restart(event))
-			continue;
-
-		perf_sample_data_init(&data, 0);
-		data.period = event->hw.last_period;
-
-		/*
-		 * See the comment in intel_pmu_drain_pebs_core()
-		 */
-		regs = *iregs;
-		regs.ip = at->ip;
-		regs.bp = at->bp;
-		regs.sp = at->sp;
-
-		if (intel_pmu_pebs_fixup_ip(&regs))
-			regs.flags |= PERF_EFLAGS_EXACT;
-		else
-			regs.flags &= ~PERF_EFLAGS_EXACT;
-
-		if (perf_event_overflow(event, 1, &data, &regs))
-			x86_pmu_stop(event);
+		__intel_pmu_pebs_event(event, iregs, at);
 	}
 }
 
-- 
cgit v1.2.3-55-g7522


From ab608344bcbde4f55ec4cd911b686b0ce3eae076 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 8 Apr 2010 23:03:20 +0200
Subject: perf, x86: Improve the PEBS ABI

Rename perf_event_attr::precise to perf_event_attr::precise_ip and
widen it to 2 bits. This new field describes the required precision of
the PERF_SAMPLE_IP field:

  0 - SAMPLE_IP can have arbitrary skid
  1 - SAMPLE_IP must have constant skid
  2 - SAMPLE_IP requested to have 0 skid
  3 - SAMPLE_IP must have 0 skid

And modify the Intel PEBS code accordingly. The PEBS implementation
now supports up to precise_ip == 2, where we perform the IP fixup.

Also s/PERF_RECORD_MISC_EXACT/&_IP/ to clarify its meaning, this bit
should be set for each PERF_SAMPLE_IP field known to match the actual
instruction triggering the event.

This new scheme allows for a PEBS mode that uses the buffer for more
than a single event.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Stephane Eranian <eranian@google.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c          | 17 ++++++++++++++++-
 arch/x86/kernel/cpu/perf_event_intel.c    |  4 ++--
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 12 ++++++------
 include/linux/perf_event.h                | 23 +++++++++++++++++++----
 tools/perf/builtin-top.c                  |  2 +-
 tools/perf/util/parse-events.c            | 25 ++++++++++++++++---------
 6 files changed, 60 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4a3f1f2b9b91..27fa9eeed024 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -488,6 +488,21 @@ static int x86_setup_perfctr(struct perf_event *event)
 
 static int x86_pmu_hw_config(struct perf_event *event)
 {
+	if (event->attr.precise_ip) {
+		int precise = 0;
+
+		/* Support for constant skid */
+		if (x86_pmu.pebs)
+			precise++;
+
+		/* Support for IP fixup */
+		if (x86_pmu.lbr_nr)
+			precise++;
+
+		if (event->attr.precise_ip > precise)
+			return -EOPNOTSUPP;
+	}
+
 	/*
 	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
@@ -1780,7 +1795,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
 	}
 
 	if (regs->flags & PERF_EFLAGS_EXACT)
-		misc |= PERF_RECORD_MISC_EXACT;
+		misc |= PERF_RECORD_MISC_EXACT_IP;
 
 	return misc;
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a4b56ac425cb..fdbc652d3feb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -563,7 +563,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
 
 	x86_pmu_disable_event(event);
 
-	if (unlikely(event->attr.precise))
+	if (unlikely(event->attr.precise_ip))
 		intel_pmu_pebs_disable(event);
 }
 
@@ -615,7 +615,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
 		return;
 	}
 
-	if (unlikely(event->attr.precise))
+	if (unlikely(event->attr.precise_ip))
 		intel_pmu_pebs_enable(event);
 
 	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 35056f715e9e..18018d1311cd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -307,7 +307,7 @@ intel_pebs_constraints(struct perf_event *event)
 {
 	struct event_constraint *c;
 
-	if (!event->attr.precise)
+	if (!event->attr.precise_ip)
 		return NULL;
 
 	if (x86_pmu.pebs_constraints) {
@@ -330,7 +330,7 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 	cpuc->pebs_enabled |= 1ULL << hwc->idx;
 	WARN_ON_ONCE(cpuc->enabled);
 
-	if (x86_pmu.intel_cap.pebs_trap)
+	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
 		intel_pmu_lbr_enable(event);
 }
 
@@ -345,7 +345,7 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
 
-	if (x86_pmu.intel_cap.pebs_trap)
+	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
 		intel_pmu_lbr_disable(event);
 }
 
@@ -485,7 +485,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	regs.bp = pebs->bp;
 	regs.sp = pebs->sp;
 
-	if (intel_pmu_pebs_fixup_ip(regs))
+	if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
 		regs.flags |= PERF_EFLAGS_EXACT;
 	else
 		regs.flags &= ~PERF_EFLAGS_EXACT;
@@ -518,7 +518,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 
 	WARN_ON_ONCE(!event);
 
-	if (!event->attr.precise)
+	if (!event->attr.precise_ip)
 		return;
 
 	n = top - at;
@@ -570,7 +570,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 
 			WARN_ON_ONCE(!event);
 
-			if (!event->attr.precise)
+			if (!event->attr.precise_ip)
 				continue;
 
 			if (__test_and_set_bit(bit, (unsigned long *)&status))
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6be4a0f9137c..23cd0057a681 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -203,9 +203,19 @@ struct perf_event_attr {
 				enable_on_exec :  1, /* next exec enables     */
 				task           :  1, /* trace fork/exit       */
 				watermark      :  1, /* wakeup_watermark      */
-				precise        :  1, /* OoO invariant counter */
-
-				__reserved_1   : 48;
+				/*
+				 * precise_ip:
+				 *
+				 *  0 - SAMPLE_IP can have arbitrary skid
+				 *  1 - SAMPLE_IP must have constant skid
+				 *  2 - SAMPLE_IP requested to have 0 skid
+				 *  3 - SAMPLE_IP must have 0 skid
+				 *
+				 *  See also PERF_RECORD_MISC_EXACT_IP
+				 */
+				precise_ip     :  2, /* skid constraint       */
+
+				__reserved_1   : 47;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -296,7 +306,12 @@ struct perf_event_mmap_page {
 #define PERF_RECORD_MISC_GUEST_KERNEL		(4 << 0)
 #define PERF_RECORD_MISC_GUEST_USER		(5 << 0)
 
-#define PERF_RECORD_MISC_EXACT			(1 << 14)
+/*
+ * Indicates that the content of PERF_SAMPLE_IP points to
+ * the actual instruction that triggered the event. See also
+ * perf_event_attr::precise_ip.
+ */
+#define PERF_RECORD_MISC_EXACT_IP		(1 << 14)
 /*
  * Reserve the last bit to indicate some extended misc field
  */
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 3de397764cb3..ed9b5b6905fa 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1021,7 +1021,7 @@ static void event__process_sample(const event_t *self,
 		return;
 	}
 
-	if (self->header.misc & PERF_RECORD_MISC_EXACT)
+	if (self->header.misc & PERF_RECORD_MISC_EXACT_IP)
 		exact_samples++;
 
 	if (event__preprocess_sample(self, session, &al, symbol_filter) < 0 ||
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index bc8b7e614207..ae7f5917935c 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -654,10 +654,6 @@ parse_raw_event(const char **strp, struct perf_event_attr *attr)
 		return EVT_FAILED;
 	n = hex2u64(str + 1, &config);
 	if (n > 0) {
-		if (str[n+1] == 'p') {
-			attr->precise = 1;
-			n++;
-		}
 		*strp = str + n + 1;
 		attr->type = PERF_TYPE_RAW;
 		attr->config = config;
@@ -692,19 +688,29 @@ static enum event_result
 parse_event_modifier(const char **strp, struct perf_event_attr *attr)
 {
 	const char *str = *strp;
-	int eu = 1, ek = 1, eh = 1;
+	int exclude = 0;
+	int eu = 0, ek = 0, eh = 0, precise = 0;
 
 	if (*str++ != ':')
 		return 0;
 	while (*str) {
-		if (*str == 'u')
+		if (*str == 'u') {
+			if (!exclude)
+				exclude = eu = ek = eh = 1;
 			eu = 0;
-		else if (*str == 'k')
+		} else if (*str == 'k') {
+			if (!exclude)
+				exclude = eu = ek = eh = 1;
 			ek = 0;
-		else if (*str == 'h')
+		} else if (*str == 'h') {
+			if (!exclude)
+				exclude = eu = ek = eh = 1;
 			eh = 0;
-		else
+		} else if (*str == 'p') {
+			precise++;
+		} else
 			break;
+
 		++str;
 	}
 	if (str >= *strp + 2) {
@@ -712,6 +718,7 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr)
 		attr->exclude_user   = eu;
 		attr->exclude_kernel = ek;
 		attr->exclude_hv     = eh;
+		attr->precise_ip     = precise;
 		return 1;
 	}
 	return 0;
-- 
cgit v1.2.3-55-g7522


From 4d1c52b02d977d884abb21d0bbaba6b5d6bc8374 Mon Sep 17 00:00:00 2001
From: Lin Ming
Date: Fri, 23 Apr 2010 13:56:12 +0800
Subject: perf, x86: implement group scheduling transactional APIs

Convert to the transactional PMU API and remove the duplication of
group_sched_in().

Reviewed-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: David Miller <davem@davemloft.net>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1272002172.5707.61.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 180 +++++++++++++++------------------------
 1 file changed, 67 insertions(+), 113 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 27fa9eeed024..fd4db0db3708 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -110,6 +110,8 @@ struct cpu_hw_events {
 	u64			tags[X86_PMC_IDX_MAX];
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
 
+	unsigned int		group_flag;
+
 	/*
 	 * Intel DebugStore bits
 	 */
@@ -961,6 +963,14 @@ static int x86_pmu_enable(struct perf_event *event)
 	if (n < 0)
 		return n;
 
+	/*
+	 * If group events scheduling transaction was started,
+	 * skip the schedulability test here, it will be peformed
+	 * at commit time(->commit_txn) as a whole
+	 */
+	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+		goto out;
+
 	ret = x86_pmu.schedule_events(cpuc, n, assign);
 	if (ret)
 		return ret;
@@ -970,6 +980,7 @@ static int x86_pmu_enable(struct perf_event *event)
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
+out:
 	cpuc->n_events = n;
 	cpuc->n_added += n - n0;
 
@@ -1227,119 +1238,6 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	return &unconstrained;
 }
 
-static int x86_event_sched_in(struct perf_event *event,
-			  struct perf_cpu_context *cpuctx)
-{
-	int ret = 0;
-
-	event->state = PERF_EVENT_STATE_ACTIVE;
-	event->oncpu = smp_processor_id();
-	event->tstamp_running += event->ctx->time - event->tstamp_stopped;
-
-	if (!is_x86_event(event))
-		ret = event->pmu->enable(event);
-
-	if (!ret && !is_software_event(event))
-		cpuctx->active_oncpu++;
-
-	if (!ret && event->attr.exclusive)
-		cpuctx->exclusive = 1;
-
-	return ret;
-}
-
-static void x86_event_sched_out(struct perf_event *event,
-			    struct perf_cpu_context *cpuctx)
-{
-	event->state = PERF_EVENT_STATE_INACTIVE;
-	event->oncpu = -1;
-
-	if (!is_x86_event(event))
-		event->pmu->disable(event);
-
-	event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
-
-	if (!is_software_event(event))
-		cpuctx->active_oncpu--;
-
-	if (event->attr.exclusive || !cpuctx->active_oncpu)
-		cpuctx->exclusive = 0;
-}
-
-/*
- * Called to enable a whole group of events.
- * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
- * Assumes the caller has disabled interrupts and has
- * frozen the PMU with hw_perf_save_disable.
- *
- * called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
- */
-int hw_perf_group_sched_in(struct perf_event *leader,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx)
-{
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	struct perf_event *sub;
-	int assign[X86_PMC_IDX_MAX];
-	int n0, n1, ret;
-
-	if (!x86_pmu_initialized())
-		return 0;
-
-	/* n0 = total number of events */
-	n0 = collect_events(cpuc, leader, true);
-	if (n0 < 0)
-		return n0;
-
-	ret = x86_pmu.schedule_events(cpuc, n0, assign);
-	if (ret)
-		return ret;
-
-	ret = x86_event_sched_in(leader, cpuctx);
-	if (ret)
-		return ret;
-
-	n1 = 1;
-	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-		if (sub->state > PERF_EVENT_STATE_OFF) {
-			ret = x86_event_sched_in(sub, cpuctx);
-			if (ret)
-				goto undo;
-			++n1;
-		}
-	}
-	/*
-	 * copy new assignment, now we know it is possible
-	 * will be used by hw_perf_enable()
-	 */
-	memcpy(cpuc->assign, assign, n0*sizeof(int));
-
-	cpuc->n_events  = n0;
-	cpuc->n_added  += n1;
-	ctx->nr_active += n1;
-
-	/*
-	 * 1 means successful and events are active
-	 * This is not quite true because we defer
-	 * actual activation until hw_perf_enable() but
-	 * this way we* ensure caller won't try to enable
-	 * individual events
-	 */
-	return 1;
-undo:
-	x86_event_sched_out(leader, cpuctx);
-	n0  = 1;
-	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
-			x86_event_sched_out(sub, cpuctx);
-			if (++n0 == n1)
-				break;
-		}
-	}
-	return ret;
-}
-
 #include "perf_event_amd.c"
 #include "perf_event_p6.c"
 #include "perf_event_p4.c"
@@ -1471,6 +1369,59 @@ static inline void x86_pmu_read(struct perf_event *event)
 	x86_perf_event_update(event);
 }
 
+/*
+ * Start group events scheduling transaction
+ * Set the flag to make pmu::enable() not perform the
+ * schedulability test, it will be performed at commit time
+ */
+static void x86_pmu_start_txn(const struct pmu *pmu)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+}
+
+/*
+ * Stop group events scheduling transaction
+ * Clear the flag and pmu::enable() will perform the
+ * schedulability test.
+ */
+static void x86_pmu_cancel_txn(const struct pmu *pmu)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+}
+
+/*
+ * Commit group events scheduling transaction
+ * Perform the group schedulability test as a whole
+ * Return 0 if success
+ */
+static int x86_pmu_commit_txn(const struct pmu *pmu)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int assign[X86_PMC_IDX_MAX];
+	int n, ret;
+
+	n = cpuc->n_events;
+
+	if (!x86_pmu_initialized())
+		return -EAGAIN;
+
+	ret = x86_pmu.schedule_events(cpuc, n, assign);
+	if (ret)
+		return ret;
+
+	/*
+	 * copy new assignment, now we know it is possible
+	 * will be used by hw_perf_enable()
+	 */
+	memcpy(cpuc->assign, assign, n*sizeof(int));
+
+	return 0;
+}
+
 static const struct pmu pmu = {
 	.enable		= x86_pmu_enable,
 	.disable	= x86_pmu_disable,
@@ -1478,6 +1429,9 @@ static const struct pmu pmu = {
 	.stop		= x86_pmu_stop,
 	.read		= x86_pmu_read,
 	.unthrottle	= x86_pmu_unthrottle,
+	.start_txn	= x86_pmu_start_txn,
+	.cancel_txn	= x86_pmu_cancel_txn,
+	.commit_txn	= x86_pmu_commit_txn,
 };
 
 /*
-- 
cgit v1.2.3-55-g7522


From 2b107d93635616db0c3f893c8cc2e6d5cd8d77b2 Mon Sep 17 00:00:00 2001
From: Jacob Pan
Date: Fri, 7 May 2010 14:59:45 -0700
Subject: x86: Avoid check hlt for newer cpus

Check hlt instruction was targeted for some older CPUs. It is an expensive
operation in that it takes 4 ticks to break out the check.  We can avoid
such check completely for newer x86 cpus (family >= 5).

[ hpa: corrected family > 5 to family >= 5 ]

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1273269585-14346-1-git-send-email-jacob.jun.pan@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/bugs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 01a265212395..c39576cb3018 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
 
 static void __init check_hlt(void)
 {
-	if (paravirt_enabled())
+	if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
 		return;
 
 	printk(KERN_INFO "Checking 'hlt' instruction... ");
-- 
cgit v1.2.3-55-g7522


From 9fa02317429449e8176c9bb6da3ac00eb14d52d3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Fri, 7 May 2010 16:55:41 -0700
Subject: x86, HyperV: fix up the license to mshyperv.c

This should have been GPLv2 only, we cut and pasted from the wrong file
originally, sorry.

Also removed some unneeded boilerplate license code, we all know where
to find the GPLv2, and that there's no warranty as that is implicit from
the license.

Cc: Ky Srinivasan <ksrinivasan@novell.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
LKML-Reference: <20100507235541.GA15448@kroah.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mshyperv.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 2443b61cdb17..a58d8e64fc7c 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -6,18 +6,7 @@
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ * the Free Software Foundation; version 2 of the License.
  *
  */
 
-- 
cgit v1.2.3-55-g7522


From e08cae4181af9483b04ecfac48f01c8e5a5f27bf Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Fri, 7 May 2010 16:57:28 -0700
Subject: x86: Clean up the hypervisor layer

Clean up the hypervisor layer and the hypervisor drivers, using an ops
structure instead of an enumeration with if statements.

The identity of the hypervisor, if needed, can be tested by testing
the pointer value in x86_hyper.

The MS-HyperV private state is moved into a normal global variable
(it's per-system state, not per-CPU state).  Being a normal bss
variable, it will be left at all zero on non-HyperV platforms, and so
can generally be tested for HyperV-specific features without
additional qualification.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Greg KH <greg@kroah.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Ky Srinivasan <ksrinivasan@novell.com>
LKML-Reference: <4BE49778.6060800@zytor.com>
---
 arch/x86/include/asm/hyperv.h     |  5 ++--
 arch/x86/include/asm/hypervisor.h | 27 +++++++++++++++++--
 arch/x86/include/asm/mshyperv.h   | 15 ++++++++---
 arch/x86/include/asm/processor.h  |  7 -----
 arch/x86/include/asm/vmware.h     | 27 -------------------
 arch/x86/kernel/cpu/hypervisor.c  | 56 ++++++++++++++++++++++++---------------
 arch/x86/kernel/cpu/mshyperv.c    | 51 +++++++++++++++++------------------
 arch/x86/kernel/cpu/vmware.c      | 36 ++++++++++++++-----------
 8 files changed, 117 insertions(+), 107 deletions(-)
 delete mode 100644 arch/x86/include/asm/vmware.h

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
index 46040473e122..5df477ac3af7 100644
--- a/arch/x86/include/asm/hyperv.h
+++ b/arch/x86/include/asm/hyperv.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_X86_KVM_HYPERV_H
-#define _ASM_X86_KVM_HYPERV_H
+#ifndef _ASM_X86_HYPERV_H
+#define _ASM_X86_HYPERV_H
 
 #include <linux/types.h>
 
@@ -16,6 +16,7 @@
 
 #define HYPERV_HYPERVISOR_PRESENT_BIT		0x80000000
 #define HYPERV_CPUID_MIN			0x40000005
+#define HYPERV_CPUID_MAX			0x4000ffff
 
 /*
  * Feature identification. EAX indicates which features are available
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index b78c0941e422..70abda7058c8 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -17,10 +17,33 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  *
  */
-#ifndef ASM_X86__HYPERVISOR_H
-#define ASM_X86__HYPERVISOR_H
+#ifndef _ASM_X86_HYPERVISOR_H
+#define _ASM_X86_HYPERVISOR_H
 
 extern void init_hypervisor(struct cpuinfo_x86 *c);
 extern void init_hypervisor_platform(void);
 
+/*
+ * x86 hypervisor information
+ */
+struct hypervisor_x86 {
+	/* Hypervisor name */
+	const char	*name;
+
+	/* Detection routine */
+	bool		(*detect)(void);
+
+	/* Adjust CPU feature bits (run once per CPU) */
+	void		(*set_cpu_features)(struct cpuinfo_x86 *);
+
+	/* Platform setup (run once per boot) */
+	void		(*init_platform)(void);
+};
+
+extern const struct hypervisor_x86 *x86_hyper;
+
+/* Recognized hypervisors */
+extern const struct hypervisor_x86 x86_hyper_vmware;
+extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+
 #endif
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 6cd8101d1344..79ce5685ab64 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -1,7 +1,14 @@
-#ifndef ASM_X86__MSHYPER_H
-#define ASM_X86__MSHYPER_H
+#ifndef _ASM_X86_MSHYPER_H
+#define _ASM_X86_MSHYPER_H
 
-int ms_hyperv_platform(void);
-void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c);
+#include <linux/types.h>
+#include <asm/hyperv.h>
+
+struct ms_hyperv_info {
+	u32 features;
+	u32 hints;
+};
+
+extern struct ms_hyperv_info ms_hyperv;
 
 #endif
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 597c041bd124..e4f1dfb2d05b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -113,9 +113,6 @@ struct cpuinfo_x86 {
 	/* Index into per_cpu list: */
 	u16			cpu_index;
 #endif
-	unsigned int		x86_hyper_vendor;
-	/* The layout of this field is hypervisor specific */
-	unsigned int		x86_hyper_features;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL	0
@@ -129,10 +126,6 @@ struct cpuinfo_x86 {
 
 #define X86_VENDOR_UNKNOWN	0xff
 
-#define X86_HYPER_VENDOR_NONE  0
-#define X86_HYPER_VENDOR_VMWARE 1
-#define X86_HYPER_VENDOR_MSFT	2
-
 /*
  * capabilities of CPUs
  */
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
deleted file mode 100644
index e49ed6d2fd4e..000000000000
--- a/arch/x86/include/asm/vmware.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (C) 2008, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- */
-#ifndef ASM_X86__VMWARE_H
-#define ASM_X86__VMWARE_H
-
-extern void vmware_platform_setup(void);
-extern int vmware_platform(void);
-extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
-
-#endif
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index de3f4e0ce8eb..87381759d3cb 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -22,40 +22,52 @@
  */
 
 #include <asm/processor.h>
-#include <asm/vmware.h>
-#include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
 
-static inline void __cpuinit
-detect_hypervisor_vendor(struct cpuinfo_x86 *c)
+/*
+ * Hypervisor detect order.  This is specified explicitly here because
+ * some hypervisors might implement compatibility modes for other
+ * hypervisors and therefore need to be detected in specific sequence.
+ */
+static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
-	if (vmware_platform())
-		c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
-	else if (ms_hyperv_platform())
-		c->x86_hyper_vendor = X86_HYPER_VENDOR_MSFT;
-	else
-		c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
-}
+	&x86_hyper_vmware,
+	&x86_hyper_ms_hyperv,
+};
 
-static inline void __cpuinit
-hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
+const struct hypervisor_x86 *x86_hyper;
+
+static inline void __init
+detect_hypervisor_vendor(void)
 {
-	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
-		vmware_set_feature_bits(c);
-	else if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_MSFT)
-		ms_hyperv_set_feature_bits(c);
-	return;
+	const struct hypervisor_x86 *h, * const *p;
+
+	for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
+		h = *p;
+		if (h->detect()) {
+			x86_hyper = h;
+			printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
+			break;
+		}
+	}
 }
 
 void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
 {
-	detect_hypervisor_vendor(c);
-	hypervisor_set_feature_bits(c);
+	if (x86_hyper && x86_hyper->set_cpu_features)
+		x86_hyper->set_cpu_features(c);
 }
 
 void __init init_hypervisor_platform(void)
 {
+
+	detect_hypervisor_vendor();
+
+	if (!x86_hyper)
+		return;
+
 	init_hypervisor(&boot_cpu_data);
-	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
-		vmware_platform_setup();
+
+	if (x86_hyper->init_platform)
+		x86_hyper->init_platform();
 }
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index a58d8e64fc7c..5969c3ee3186 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -12,45 +12,42 @@
 
 #include <linux/types.h>
 #include <asm/processor.h>
+#include <asm/hypervisor.h>
 #include <asm/hyperv.h>
 #include <asm/mshyperv.h>
 
+struct ms_hyperv_info ms_hyperv;
 
-int ms_hyperv_platform(void)
+static bool __init ms_hyperv_platform(void)
 {
-	u32 eax, ebx, ecx, edx;
-	char hyp_signature[13];
+	u32 eax;
+	u32 hyp_signature[3];
 
-	cpuid(1, &eax, &ebx, &ecx, &edx);
-	if (!(ecx & HYPERV_HYPERVISOR_PRESENT_BIT))
-		return 0;
+	if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		return false;
 
-	cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &ebx, &ecx, &edx);
-	*(u32 *)(hyp_signature + 0) = ebx;
-	*(u32 *)(hyp_signature + 4) = ecx;
-	*(u32 *)(hyp_signature + 8) = edx;
+	cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
+	      &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
 
-	if ((eax < HYPERV_CPUID_MIN) || (memcmp("Microsoft Hv", hyp_signature, 12)))
-		return 0;
-	return 1;
+	return eax >= HYPERV_CPUID_MIN &&
+		eax <= HYPERV_CPUID_MAX &&
+		!memcmp("Microsoft Hv", hyp_signature, 12);
 }
 
-void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c)
+static void __init ms_hyperv_init_platform(void)
 {
-	u32 eax, ebx, ecx, edx;
-
-	c->x86_hyper_features = 0;
 	/*
-	 * Extract the features, recommendations etc.
-	 * The first 9 bits will be used to track hypervisor features.
-	 * The next 6 bits will be used to track the hypervisor
-	 * recommendations.
+	 * Extract the features and hints
 	 */
-	cpuid(HYPERV_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
-	c->x86_hyper_features |= (eax & 0x1ff);
+	ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+	ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
 
-	cpuid(HYPERV_CPUID_ENLIGHTMENT_INFO, &eax, &ebx, &ecx, &edx);
-	c->x86_hyper_features |= ((eax & 0x3f) << 9);
-	printk(KERN_INFO "Detected HyperV with features: %x\n",
-		c->x86_hyper_features);
+	printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
+	       ms_hyperv.features, ms_hyperv.hints);
 }
+
+const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+	.name			= "Microsoft HyperV",
+	.detect			= ms_hyperv_platform,
+	.init_platform		= ms_hyperv_init_platform,
+};
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97b59cf..46a5b5d3ba5e 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -23,8 +23,8 @@
 
 #include <linux/dmi.h>
 #include <asm/div64.h>
-#include <asm/vmware.h>
 #include <asm/x86_init.h>
+#include <asm/hypervisor.h>
 
 #define CPUID_VMWARE_INFO_LEAF	0x40000000
 #define VMWARE_HYPERVISOR_MAGIC	0x564D5868
@@ -64,7 +64,7 @@ static unsigned long vmware_get_tsc_khz(void)
 	return tsc_hz;
 }
 
-void __init vmware_platform_setup(void)
+static void __init vmware_platform_setup(void)
 {
 	uint32_t eax, ebx, ecx, edx;
 
@@ -82,24 +82,21 @@ void __init vmware_platform_setup(void)
  * serial key should be enough, as this will always have a VMware
  * specific string when running under VMware hypervisor.
  */
-int vmware_platform(void)
+static bool __init vmware_platform(void)
 {
 	if (cpu_has_hypervisor) {
-		unsigned int eax, ebx, ecx, edx;
-		char hyper_vendor_id[13];
-
-		cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
-		memcpy(hyper_vendor_id + 0, &ebx, 4);
-		memcpy(hyper_vendor_id + 4, &ecx, 4);
-		memcpy(hyper_vendor_id + 8, &edx, 4);
-		hyper_vendor_id[12] = '\0';
-		if (!strcmp(hyper_vendor_id, "VMwareVMware"))
-			return 1;
+		unsigned int eax;
+		unsigned int hyper_vendor_id[3];
+
+		cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
+		      &hyper_vendor_id[1], &hyper_vendor_id[2]);
+		if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
+			return true;
 	} else if (dmi_available && dmi_name_in_serial("VMware") &&
 		   __vmware_platform())
-		return 1;
+		return true;
 
-	return 0;
+	return false;
 }
 
 /*
@@ -114,8 +111,15 @@ int vmware_platform(void)
  * so that the kernel could just trust the hypervisor with providing a
  * reliable virtual TSC that is suitable for timekeeping.
  */
-void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
+static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
 {
 	set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 	set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
 }
+
+const __refconst struct hypervisor_x86 x86_hyper_vmware = {
+	.name			= "VMware",
+	.detect			= vmware_platform,
+	.set_cpu_features	= vmware_set_cpu_features,
+	.init_platform		= vmware_platform_setup,
+};
-- 
cgit v1.2.3-55-g7522


From de902d967feb96f2dfddfbe9dbd69dc22fd5ebcb Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Sat, 8 May 2010 15:39:52 +0400
Subject: x86, perf: P4 PMU -- configure predefined events

If an event is not RAW we should not exit p4_hw_config
early but call x86_setup_perfctr as well.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Robert Richter <robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 9e002054cb5f..b1f532d1d36f 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -439,21 +439,20 @@ static int p4_hw_config(struct perf_event *event)
 	if (p4_ht_active() && p4_ht_thread(cpu))
 		event->hw.config = p4_set_ht_bit(event->hw.config);
 
-	if (event->attr.type != PERF_TYPE_RAW)
-		return 0;
-
-	/*
-	 * We don't control raw events so it's up to the caller
-	 * to pass sane values (and we don't count the thread number
-	 * on HT machine but allow HT-compatible specifics to be
-	 * passed on)
-	 *
-	 * XXX: HT wide things should check perf_paranoid_cpu() &&
-	 *      CAP_SYS_ADMIN
-	 */
-	event->hw.config |= event->attr.config &
-		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
-		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
+	if (event->attr.type == PERF_TYPE_RAW) {
+		/*
+		 * We don't control raw events so it's up to the caller
+		 * to pass sane values (and we don't count the thread number
+		 * on HT machine but allow HT-compatible specifics to be
+		 * passed on)
+		 *
+		 * XXX: HT wide things should check perf_paranoid_cpu() &&
+		 *      CAP_SYS_ADMIN
+		 */
+		event->hw.config |= event->attr.config &
+			(p4_config_pack_escr(P4_ESCR_MASK_HT) |
+			 p4_config_pack_cccr(P4_CCCR_MASK_HT));
+	}
 
 	return x86_setup_perfctr(event);
 }
-- 
cgit v1.2.3-55-g7522


From 137351e0feeb9f25d99488ee1afc1c79f5499a9a Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Sat, 8 May 2010 15:25:52 +0400
Subject: x86, perf: P4 PMU -- protect sensible procedures from preemption

Steven reported:

|
| I'm getting:
|
| Pid: 3477, comm: perf Not tainted 2.6.34-rc6 #2727
| Call Trace:
|  [<ffffffff811c7565>] debug_smp_processor_id+0xd5/0xf0
|  [<ffffffff81019874>] p4_hw_config+0x2b/0x15c
|  [<ffffffff8107acbc>] ? trace_hardirqs_on_caller+0x12b/0x14f
|  [<ffffffff81019143>] hw_perf_event_init+0x468/0x7be
|  [<ffffffff810782fd>] ? debug_mutex_init+0x31/0x3c
|  [<ffffffff810c68b2>] T.850+0x273/0x42e
|  [<ffffffff810c6cab>] sys_perf_event_open+0x23e/0x3f1
|  [<ffffffff81009e6a>] ? sysret_check+0x2e/0x69
|  [<ffffffff81009e32>] system_call_fastpath+0x16/0x1b
|
| When running perf record in latest tip/perf/core
|

Due to the fact that p4 counters are shared between HT threads
we synthetically divide the whole set of counters into two
non-intersected subsets. And while we're "borrowing" counters
from these subsets we should not be preempted (well, strictly
speaking in p4_hw_config we just pre-set reference to the
subset which allow to save some cycles in schedule routine
if it happens on the same cpu). So use get_cpu/put_cpu pair.

Also p4_pmu_schedule_events should use smp_processor_id rather
than raw_ version. This allow us to catch up preemption issue
(if there will ever be).

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100508112716.963478928@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index b1f532d1d36f..ca40180c41d4 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -421,7 +421,8 @@ static u64 p4_pmu_event_map(int hw_event)
 
 static int p4_hw_config(struct perf_event *event)
 {
-	int cpu = raw_smp_processor_id();
+	int cpu = get_cpu();
+	int rc = 0;
 	u32 escr, cccr;
 
 	/*
@@ -454,7 +455,10 @@ static int p4_hw_config(struct perf_event *event)
 			 p4_config_pack_cccr(P4_CCCR_MASK_HT));
 	}
 
-	return x86_setup_perfctr(event);
+	rc = x86_setup_perfctr(event);
+	put_cpu();
+
+	return rc;
 }
 
 static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
-- 
cgit v1.2.3-55-g7522


From 3f51b7119d052827dbb0e40c966acdf2bdc6f47f Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Sat, 8 May 2010 15:25:53 +0400
Subject: x86, perf: P4 PMU -- Get rid of redundant check for array index

The caller already has done such a check.
And it was wrong anyway, it had to be '>=' rather than '>'

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100508112717.130386882@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ca40180c41d4..b8c2d379eea6 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -406,11 +406,6 @@ static u64 p4_pmu_event_map(int hw_event)
 	unsigned int esel;
 	u64 config;
 
-	if (hw_event > ARRAY_SIZE(p4_general_events)) {
-		printk_once(KERN_ERR "P4 PMU: Bad index: %i\n", hw_event);
-		return 0;
-	}
-
 	config = p4_general_events[hw_event];
 	bind = p4_config_get_bind(config);
 	esel = P4_OPCODE_ESEL(bind->opcode);
-- 
cgit v1.2.3-55-g7522


From c7993165ef0c1d636ca05f4787739f8414584e6d Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Sat, 8 May 2010 15:25:54 +0400
Subject: x86, perf: P4 PMU -- check for proper event index in RAW events

RAW events are special and we should be ready for user passing
in insane event index values.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100508112717.315897547@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index b8c2d379eea6..a603930271f3 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -418,6 +418,7 @@ static int p4_hw_config(struct perf_event *event)
 {
 	int cpu = get_cpu();
 	int rc = 0;
+	unsigned int evnt;
 	u32 escr, cccr;
 
 	/*
@@ -436,6 +437,14 @@ static int p4_hw_config(struct perf_event *event)
 		event->hw.config = p4_set_ht_bit(event->hw.config);
 
 	if (event->attr.type == PERF_TYPE_RAW) {
+
+		/* user data may have out-of-bound event index */
+		evnt = p4_config_unpack_event(event->attr.config);
+		if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
+			rc = -EINVAL;
+			goto out;
+		}
+
 		/*
 		 * We don't control raw events so it's up to the caller
 		 * to pass sane values (and we don't count the thread number
@@ -451,8 +460,8 @@ static int p4_hw_config(struct perf_event *event)
 	}
 
 	rc = x86_setup_perfctr(event);
+out:
 	put_cpu();
-
 	return rc;
 }
 
-- 
cgit v1.2.3-55-g7522


From 96f6e775b58687d85ee33004d414419b5ec34106 Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Sun, 9 May 2010 01:10:34 -0700
Subject: x86, hypervisor: Export the x86_hyper* symbols

Export x86_hyper and the related specific structures, allowing for
hypervisor identification by modules.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Greg KH <greg@kroah.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Ky Srinivasan <ksrinivasan@novell.com>
Cc: Dmitry Torokhov <dtor@vmware.com>
LKML-Reference: <4BE49778.6060800@zytor.com>
---
 arch/x86/kernel/cpu/hypervisor.c | 1 +
 arch/x86/kernel/cpu/mshyperv.c   | 1 +
 arch/x86/kernel/cpu/vmware.c     | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 87381759d3cb..4afb5a2130ed 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -36,6 +36,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
 };
 
 const struct hypervisor_x86 *x86_hyper;
+EXPORT_SYMBOL(x86_hyper);
 
 static inline void __init
 detect_hypervisor_vendor(void)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 5969c3ee3186..0f1371724c86 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -51,3 +51,4 @@ const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
 	.detect			= ms_hyperv_platform,
 	.init_platform		= ms_hyperv_init_platform,
 };
+EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 265b432f6e64..b9d1ff588445 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -99,7 +99,6 @@ static bool __init vmware_platform(void)
 
 	return false;
 }
-EXPORT_SYMBOL(vmware_platform);
 
 /*
  * VMware hypervisor takes care of exporting a reliable TSC to the guest.
@@ -125,3 +124,4 @@ const __refconst struct hypervisor_x86 x86_hyper_vmware = {
 	.set_cpu_features	= vmware_set_cpu_features,
 	.init_platform		= vmware_platform_setup,
 };
+EXPORT_SYMBOL(x86_hyper_vmware);
-- 
cgit v1.2.3-55-g7522


From 3998d095354d2a3062bdaa821ef07a1e1c82873c Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Sun, 9 May 2010 22:46:54 -0700
Subject: x86, hypervisor: add missing <linux/module.h>

EXPORT_SYMBOL() needs <linux/module.h> to be included; fixes modular
builds of the VMware balloon driver, and any future modular drivers
which depends on the hypervisor.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Greg KH <greg@kroah.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Ky Srinivasan <ksrinivasan@novell.com>
Cc: Dmitry Torokhov <dtor@vmware.com>
LKML-Reference: <4BE49778.6060800@zytor.com>
---
 arch/x86/kernel/cpu/hypervisor.c | 1 +
 arch/x86/kernel/cpu/mshyperv.c   | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 4afb5a2130ed..dd531cc56a8f 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,6 +21,7 @@
  *
  */
 
+#include <linux/module.h>
 #include <asm/processor.h>
 #include <asm/hypervisor.h>
 
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 0f1371724c86..16f41bbe46b6 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/module.h>
 #include <asm/processor.h>
 #include <asm/hypervisor.h>
 #include <asm/hyperv.h>
-- 
cgit v1.2.3-55-g7522


From c9ad488289144ae5ef53b012e15895ef1f5e4bb6 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 6 May 2010 11:45:45 +0300
Subject: x86: Eliminate TS_XSAVE

The fpu code currently uses current->thread_info->status & TS_XSAVE as
a way to distinguish between XSAVE capable processors and older processors.
The decision is not really task specific; instead we use the task status to
avoid a global memory reference - the value should be the same across all
threads.

Eliminate this tie-in into the task structure by using an alternative
instruction keyed off the XSAVE cpu feature; this results in shorter and
faster code, without introducing a global memory reference.

[ hpa: in the future, this probably should use an asm jmp ]

Signed-off-by: Avi Kivity <avi@redhat.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-2-git-send-email-avi@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/i387.h        | 20 ++++++++++++++++----
 arch/x86/include/asm/thread_info.h |  1 -
 arch/x86/kernel/cpu/common.c       |  5 +----
 arch/x86/kernel/i387.c             |  5 +----
 arch/x86/kernel/xsave.c            |  6 +++---
 5 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index da2930924501..a301a6825c3a 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -56,6 +56,18 @@ extern int restore_i387_xstate_ia32(void __user *buf);
 
 #define X87_FSW_ES (1 << 7)	/* Exception Summary */
 
+static inline bool use_xsave(void)
+{
+	u8 has_xsave;
+
+	alternative_io("mov $0, %0",
+		       "mov $1, %0",
+		       X86_FEATURE_XSAVE,
+		       "=g"(has_xsave));
+
+	return has_xsave;
+}
+
 #ifdef CONFIG_X86_64
 
 /* Ignore delayed exceptions from user space */
@@ -99,7 +111,7 @@ static inline void clear_fpu_state(struct task_struct *tsk)
 	/*
 	 * xsave header may indicate the init state of the FP.
 	 */
-	if ((task_thread_info(tsk)->status & TS_XSAVE) &&
+	if (use_xsave() &&
 	    !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
 		return;
 
@@ -164,7 +176,7 @@ static inline void fxsave(struct task_struct *tsk)
 
 static inline void __save_init_fpu(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_XSAVE)
+	if (use_xsave())
 		xsave(tsk);
 	else
 		fxsave(tsk);
@@ -218,7 +230,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
  */
 static inline void __save_init_fpu(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_XSAVE) {
+	if (use_xsave()) {
 		struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
 		struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
 
@@ -266,7 +278,7 @@ end:
 
 static inline int restore_fpu_checking(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_XSAVE)
+	if (use_xsave())
 		return xrstor_checking(&tsk->thread.xstate->xsave);
 	else
 		return fxrstor_checking(&tsk->thread.xstate->fxsave);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e0d28901e969..e9e341505ab3 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -244,7 +244,6 @@ static inline struct thread_info *current_thread_info(void)
 #define TS_POLLING		0x0004	/* true if in idle loop
 					   and not sleeping */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
-#define TS_XSAVE		0x0010	/* Use xsave/xrstor */
 
 #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4868e4a951ee..c1c00d0b1692 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1243,10 +1243,7 @@ void __cpuinit cpu_init(void)
 	/*
 	 * Force FPU initialization:
 	 */
-	if (cpu_has_xsave)
-		current_thread_info()->status = TS_XSAVE;
-	else
-		current_thread_info()->status = 0;
+	current_thread_info()->status = 0;
 	clear_used_math();
 	mxcsr_feature_mask_init();
 
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 54c31c285488..14ca1dc7a703 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -102,10 +102,7 @@ void __cpuinit fpu_init(void)
 
 	mxcsr_feature_mask_init();
 	/* clean state in init */
-	if (cpu_has_xsave)
-		current_thread_info()->status = TS_XSAVE;
-	else
-		current_thread_info()->status = 0;
+	current_thread_info()->status = 0;
 	clear_used_math();
 }
 #endif	/* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 782c3a362ec6..c1b0a11033a2 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -99,7 +99,7 @@ int save_i387_xstate(void __user *buf)
 		if (err)
 			return err;
 
-		if (task_thread_info(tsk)->status & TS_XSAVE)
+		if (use_xsave())
 			err = xsave_user(buf);
 		else
 			err = fxsave_user(buf);
@@ -116,7 +116,7 @@ int save_i387_xstate(void __user *buf)
 
 	clear_used_math(); /* trigger finit */
 
-	if (task_thread_info(tsk)->status & TS_XSAVE) {
+	if (use_xsave()) {
 		struct _fpstate __user *fx = buf;
 		struct _xstate __user *x = buf;
 		u64 xstate_bv;
@@ -225,7 +225,7 @@ int restore_i387_xstate(void __user *buf)
 		clts();
 		task_thread_info(current)->status |= TS_USEDFPU;
 	}
-	if (task_thread_info(tsk)->status & TS_XSAVE)
+	if (use_xsave())
 		err = restore_user_xstate(buf);
 	else
 		err = fxrstor_checking((__force struct i387_fxsave_struct *)
-- 
cgit v1.2.3-55-g7522


From 86603283326c9e95e5ad4e9fdddeec93cac5d9ad Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 6 May 2010 11:45:46 +0300
Subject: x86: Introduce 'struct fpu' and related API

Currently all fpu state access is through tsk->thread.xstate.  Since we wish
to generalize fpu access to non-task contexts, wrap the state in a new
'struct fpu' and convert existing access to use an fpu API.

Signal frame handlers are not converted to the API since they will remain
task context only things.

Signed-off-by: Avi Kivity <avi@redhat.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-3-git-send-email-avi@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/i387.h      | 115 +++++++++++++++++++++++++++++----------
 arch/x86/include/asm/processor.h |   6 +-
 arch/x86/include/asm/xsave.h     |   7 ++-
 arch/x86/kernel/i387.c           | 102 +++++++++++++++++-----------------
 arch/x86/kernel/process.c        |  21 +++----
 arch/x86/kernel/process_32.c     |   2 +-
 arch/x86/kernel/process_64.c     |   2 +-
 arch/x86/kernel/xsave.c          |   2 +-
 arch/x86/math-emu/fpu_aux.c      |   6 +-
 9 files changed, 160 insertions(+), 103 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a301a6825c3a..1a8cca33b736 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -16,6 +16,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/regset.h>
 #include <linux/hardirq.h>
+#include <linux/slab.h>
 #include <asm/asm.h>
 #include <asm/processor.h>
 #include <asm/sigcontext.h>
@@ -103,10 +104,10 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
    values. The kernel data segment can be sometimes 0 and sometimes
    new user value. Both should be ok.
    Use the PDA as safe address because it should be already in L1. */
-static inline void clear_fpu_state(struct task_struct *tsk)
+static inline void fpu_clear(struct fpu *fpu)
 {
-	struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
-	struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+	struct xsave_struct *xstate = &fpu->state->xsave;
+	struct i387_fxsave_struct *fx = &fpu->state->fxsave;
 
 	/*
 	 * xsave header may indicate the init state of the FP.
@@ -123,6 +124,11 @@ static inline void clear_fpu_state(struct task_struct *tsk)
 			  X86_FEATURE_FXSAVE_LEAK);
 }
 
+static inline void clear_fpu_state(struct task_struct *tsk)
+{
+	fpu_clear(&tsk->thread.fpu);
+}
+
 static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 {
 	int err;
@@ -147,7 +153,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 	return err;
 }
 
-static inline void fxsave(struct task_struct *tsk)
+static inline void fpu_fxsave(struct fpu *fpu)
 {
 	/* Using "rex64; fxsave %0" is broken because, if the memory operand
 	   uses any extended registers for addressing, a second REX prefix
@@ -157,42 +163,45 @@ static inline void fxsave(struct task_struct *tsk)
 	/* Using "fxsaveq %0" would be the ideal choice, but is only supported
 	   starting with gas 2.16. */
 	__asm__ __volatile__("fxsaveq %0"
-			     : "=m" (tsk->thread.xstate->fxsave));
+			     : "=m" (fpu->state->fxsave));
 #elif 0
 	/* Using, as a workaround, the properly prefixed form below isn't
 	   accepted by any binutils version so far released, complaining that
 	   the same type of prefix is used twice if an extended register is
 	   needed for addressing (fix submitted to mainline 2005-11-21). */
 	__asm__ __volatile__("rex64/fxsave %0"
-			     : "=m" (tsk->thread.xstate->fxsave));
+			     : "=m" (fpu->state->fxsave));
 #else
 	/* This, however, we can work around by forcing the compiler to select
 	   an addressing mode that doesn't require extended registers. */
 	__asm__ __volatile__("rex64/fxsave (%1)"
-			     : "=m" (tsk->thread.xstate->fxsave)
-			     : "cdaSDb" (&tsk->thread.xstate->fxsave));
+			     : "=m" (fpu->state->fxsave)
+			     : "cdaSDb" (&fpu->state->fxsave));
 #endif
 }
 
-static inline void __save_init_fpu(struct task_struct *tsk)
+static inline void fpu_save_init(struct fpu *fpu)
 {
 	if (use_xsave())
-		xsave(tsk);
+		fpu_xsave(fpu);
 	else
-		fxsave(tsk);
+		fpu_fxsave(fpu);
 
-	clear_fpu_state(tsk);
+	fpu_clear(fpu);
+}
+
+static inline void __save_init_fpu(struct task_struct *tsk)
+{
+	fpu_save_init(&tsk->thread.fpu);
 	task_thread_info(tsk)->status &= ~TS_USEDFPU;
 }
 
 #else  /* CONFIG_X86_32 */
 
 #ifdef CONFIG_MATH_EMULATION
-extern void finit_task(struct task_struct *tsk);
+extern void finit_soft_fpu(struct i387_soft_struct *soft);
 #else
-static inline void finit_task(struct task_struct *tsk)
-{
-}
+static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
 #endif
 
 static inline void tolerant_fwait(void)
@@ -228,13 +237,13 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 /*
  * These must be called with preempt disabled
  */
-static inline void __save_init_fpu(struct task_struct *tsk)
+static inline void fpu_save_init(struct fpu *fpu)
 {
 	if (use_xsave()) {
-		struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
-		struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+		struct xsave_struct *xstate = &fpu->state->xsave;
+		struct i387_fxsave_struct *fx = &fpu->state->fxsave;
 
-		xsave(tsk);
+		fpu_xsave(fpu);
 
 		/*
 		 * xsave header may indicate the init state of the FP.
@@ -258,8 +267,8 @@ static inline void __save_init_fpu(struct task_struct *tsk)
 		"fxsave %[fx]\n"
 		"bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
 		X86_FEATURE_FXSR,
-		[fx] "m" (tsk->thread.xstate->fxsave),
-		[fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory");
+		[fx] "m" (fpu->state->fxsave),
+		[fsw] "m" (fpu->state->fxsave.swd) : "memory");
 clear_state:
 	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
 	   is pending.  Clear the x87 state here by setting it to fixed
@@ -271,17 +280,34 @@ clear_state:
 		X86_FEATURE_FXSAVE_LEAK,
 		[addr] "m" (safe_address));
 end:
+	;
+}
+
+static inline void __save_init_fpu(struct task_struct *tsk)
+{
+	fpu_save_init(&tsk->thread.fpu);
 	task_thread_info(tsk)->status &= ~TS_USEDFPU;
 }
 
+
 #endif	/* CONFIG_X86_64 */
 
-static inline int restore_fpu_checking(struct task_struct *tsk)
+static inline int fpu_fxrstor_checking(struct fpu *fpu)
+{
+	return fxrstor_checking(&fpu->state->fxsave);
+}
+
+static inline int fpu_restore_checking(struct fpu *fpu)
 {
 	if (use_xsave())
-		return xrstor_checking(&tsk->thread.xstate->xsave);
+		return fpu_xrstor_checking(fpu);
 	else
-		return fxrstor_checking(&tsk->thread.xstate->fxsave);
+		return fpu_fxrstor_checking(fpu);
+}
+
+static inline int restore_fpu_checking(struct task_struct *tsk)
+{
+	return fpu_restore_checking(&tsk->thread.fpu);
 }
 
 /*
@@ -409,30 +435,59 @@ static inline void clear_fpu(struct task_struct *tsk)
 static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
 {
 	if (cpu_has_fxsr) {
-		return tsk->thread.xstate->fxsave.cwd;
+		return tsk->thread.fpu.state->fxsave.cwd;
 	} else {
-		return (unsigned short)tsk->thread.xstate->fsave.cwd;
+		return (unsigned short)tsk->thread.fpu.state->fsave.cwd;
 	}
 }
 
 static inline unsigned short get_fpu_swd(struct task_struct *tsk)
 {
 	if (cpu_has_fxsr) {
-		return tsk->thread.xstate->fxsave.swd;
+		return tsk->thread.fpu.state->fxsave.swd;
 	} else {
-		return (unsigned short)tsk->thread.xstate->fsave.swd;
+		return (unsigned short)tsk->thread.fpu.state->fsave.swd;
 	}
 }
 
 static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
 {
 	if (cpu_has_xmm) {
-		return tsk->thread.xstate->fxsave.mxcsr;
+		return tsk->thread.fpu.state->fxsave.mxcsr;
 	} else {
 		return MXCSR_DEFAULT;
 	}
 }
 
+static bool fpu_allocated(struct fpu *fpu)
+{
+	return fpu->state != NULL;
+}
+
+static inline int fpu_alloc(struct fpu *fpu)
+{
+	if (fpu_allocated(fpu))
+		return 0;
+	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
+	if (!fpu->state)
+		return -ENOMEM;
+	WARN_ON((unsigned long)fpu->state & 15);
+	return 0;
+}
+
+static inline void fpu_free(struct fpu *fpu)
+{
+	if (fpu->state) {
+		kmem_cache_free(task_xstate_cachep, fpu->state);
+		fpu->state = NULL;
+	}
+}
+
+static inline void fpu_copy(struct fpu *dst, struct fpu *src)
+{
+	memcpy(dst->state, src->state, xstate_size);
+}
+
 #endif /* __ASSEMBLY__ */
 
 #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b753ea59703a..b684f587647c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -380,6 +380,10 @@ union thread_xstate {
 	struct xsave_struct		xsave;
 };
 
+struct fpu {
+	union thread_xstate *state;
+};
+
 #ifdef CONFIG_X86_64
 DECLARE_PER_CPU(struct orig_ist, orig_ist);
 
@@ -457,7 +461,7 @@ struct thread_struct {
 	unsigned long		trap_no;
 	unsigned long		error_code;
 	/* floating point and extended processor state */
-	union thread_xstate	*xstate;
+	struct fpu		fpu;
 #ifdef CONFIG_X86_32
 	/* Virtual 86 mode info */
 	struct vm86_struct __user *vm86_info;
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index ddc04ccad03b..2c4390cae228 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -37,8 +37,9 @@ extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
 			    void __user *fpstate,
 			    struct _fpx_sw_bytes *sw);
 
-static inline int xrstor_checking(struct xsave_struct *fx)
+static inline int fpu_xrstor_checking(struct fpu *fpu)
 {
+	struct xsave_struct *fx = &fpu->state->xsave;
 	int err;
 
 	asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
@@ -110,12 +111,12 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
 		     :   "memory");
 }
 
-static inline void xsave(struct task_struct *tsk)
+static inline void fpu_xsave(struct fpu *fpu)
 {
 	/* This, however, we can work around by forcing the compiler to select
 	   an addressing mode that doesn't require extended registers. */
 	__asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
-			     : : "D" (&(tsk->thread.xstate->xsave)),
+			     : : "D" (&(fpu->state->xsave)),
 				 "a" (-1), "d"(-1) : "memory");
 }
 #endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 14ca1dc7a703..86cef6b32253 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -107,57 +107,57 @@ void __cpuinit fpu_init(void)
 }
 #endif	/* CONFIG_X86_64 */
 
-/*
- * The _current_ task is using the FPU for the first time
- * so initialize it and set the mxcsr to its default
- * value at reset if we support XMM instructions and then
- * remeber the current task has used the FPU.
- */
-int init_fpu(struct task_struct *tsk)
+static void fpu_finit(struct fpu *fpu)
 {
-	if (tsk_used_math(tsk)) {
-		if (HAVE_HWFP && tsk == current)
-			unlazy_fpu(tsk);
-		return 0;
-	}
-
-	/*
-	 * Memory allocation at the first usage of the FPU and other state.
-	 */
-	if (!tsk->thread.xstate) {
-		tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
-						      GFP_KERNEL);
-		if (!tsk->thread.xstate)
-			return -ENOMEM;
-	}
-
 #ifdef CONFIG_X86_32
 	if (!HAVE_HWFP) {
-		memset(tsk->thread.xstate, 0, xstate_size);
-		finit_task(tsk);
-		set_stopped_child_used_math(tsk);
-		return 0;
+		finit_soft_fpu(&fpu->state->soft);
+		return;
 	}
 #endif
 
 	if (cpu_has_fxsr) {
-		struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+		struct i387_fxsave_struct *fx = &fpu->state->fxsave;
 
 		memset(fx, 0, xstate_size);
 		fx->cwd = 0x37f;
 		if (cpu_has_xmm)
 			fx->mxcsr = MXCSR_DEFAULT;
 	} else {
-		struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
+		struct i387_fsave_struct *fp = &fpu->state->fsave;
 		memset(fp, 0, xstate_size);
 		fp->cwd = 0xffff037fu;
 		fp->swd = 0xffff0000u;
 		fp->twd = 0xffffffffu;
 		fp->fos = 0xffff0000u;
 	}
+}
+
+/*
+ * The _current_ task is using the FPU for the first time
+ * so initialize it and set the mxcsr to its default
+ * value at reset if we support XMM instructions and then
+ * remeber the current task has used the FPU.
+ */
+int init_fpu(struct task_struct *tsk)
+{
+	int ret;
+
+	if (tsk_used_math(tsk)) {
+		if (HAVE_HWFP && tsk == current)
+			unlazy_fpu(tsk);
+		return 0;
+	}
+
 	/*
-	 * Only the device not available exception or ptrace can call init_fpu.
+	 * Memory allocation at the first usage of the FPU and other state.
 	 */
+	ret = fpu_alloc(&tsk->thread.fpu);
+	if (ret)
+		return ret;
+
+	fpu_finit(&tsk->thread.fpu);
+
 	set_stopped_child_used_math(tsk);
 	return 0;
 }
@@ -191,7 +191,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
 		return ret;
 
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-				   &target->thread.xstate->fxsave, 0, -1);
+				   &target->thread.fpu.state->fxsave, 0, -1);
 }
 
 int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -208,19 +208,19 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
 		return ret;
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-				 &target->thread.xstate->fxsave, 0, -1);
+				 &target->thread.fpu.state->fxsave, 0, -1);
 
 	/*
 	 * mxcsr reserved bits must be masked to zero for security reasons.
 	 */
-	target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+	target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
 
 	/*
 	 * update the header bits in the xsave header, indicating the
 	 * presence of FP and SSE state.
 	 */
 	if (cpu_has_xsave)
-		target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+		target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
 
 	return ret;
 }
@@ -243,14 +243,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 	 * memory layout in the thread struct, so that we can copy the entire
 	 * xstateregs to the user using one user_regset_copyout().
 	 */
-	memcpy(&target->thread.xstate->fxsave.sw_reserved,
+	memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
 	       xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
 
 	/*
 	 * Copy the xstate memory layout.
 	 */
 	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-				  &target->thread.xstate->xsave, 0, -1);
+				  &target->thread.fpu.state->xsave, 0, -1);
 	return ret;
 }
 
@@ -269,14 +269,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 		return ret;
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-				 &target->thread.xstate->xsave, 0, -1);
+				 &target->thread.fpu.state->xsave, 0, -1);
 
 	/*
 	 * mxcsr reserved bits must be masked to zero for security reasons.
 	 */
-	target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+	target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
 
-	xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
+	xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
 
 	xsave_hdr->xstate_bv &= pcntxt_mask;
 	/*
@@ -362,7 +362,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
 static void
 convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
 {
-	struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
 	struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
 	struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
 	int i;
@@ -402,7 +402,7 @@ static void convert_to_fxsr(struct task_struct *tsk,
 			    const struct user_i387_ia32_struct *env)
 
 {
-	struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
 	struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
 	struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
 	int i;
@@ -442,7 +442,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
 
 	if (!cpu_has_fxsr) {
 		return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-					   &target->thread.xstate->fsave, 0,
+					   &target->thread.fpu.state->fsave, 0,
 					   -1);
 	}
 
@@ -472,7 +472,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 
 	if (!cpu_has_fxsr) {
 		return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-					  &target->thread.xstate->fsave, 0, -1);
+					  &target->thread.fpu.state->fsave, 0, -1);
 	}
 
 	if (pos > 0 || count < sizeof(env))
@@ -487,7 +487,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 	 * presence of FP.
 	 */
 	if (cpu_has_xsave)
-		target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
+		target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
 	return ret;
 }
 
@@ -498,7 +498,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
 {
 	struct task_struct *tsk = current;
-	struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
+	struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
 
 	fp->status = fp->swd;
 	if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
@@ -509,7 +509,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
 static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
 {
 	struct task_struct *tsk = current;
-	struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+	struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
 	struct user_i387_ia32_struct env;
 	int err = 0;
 
@@ -544,7 +544,7 @@ static int save_i387_xsave(void __user *buf)
 	 * header as well as change any contents in the memory layout.
 	 * xrestore as part of sigreturn will capture all the changes.
 	 */
-	tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+	tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
 
 	if (save_i387_fxsave(fx) < 0)
 		return -1;
@@ -596,7 +596,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
 {
 	struct task_struct *tsk = current;
 
-	return __copy_from_user(&tsk->thread.xstate->fsave, buf,
+	return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
 				sizeof(struct i387_fsave_struct));
 }
 
@@ -607,10 +607,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
 	struct user_i387_ia32_struct env;
 	int err;
 
-	err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
+	err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
 			       size);
 	/* mxcsr reserved bits must be masked to zero for security reasons */
-	tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+	tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
 	if (err || __copy_from_user(&env, buf, sizeof(env)))
 		return 1;
 	convert_to_fxsr(tsk, &env);
@@ -626,7 +626,7 @@ static int restore_i387_xsave(void __user *buf)
 	struct i387_fxsave_struct __user *fx =
 		(struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
 	struct xsave_hdr_struct *xsave_hdr =
-				&current->thread.xstate->xsave.xsave_hdr;
+				&current->thread.fpu.state->xsave.xsave_hdr;
 	u64 mask;
 	int err;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4d8b94..f18fd9c15247 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -32,25 +32,22 @@ struct kmem_cache *task_xstate_cachep;
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
+	int ret;
+
 	*dst = *src;
-	if (src->thread.xstate) {
-		dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
-						      GFP_KERNEL);
-		if (!dst->thread.xstate)
-			return -ENOMEM;
-		WARN_ON((unsigned long)dst->thread.xstate & 15);
-		memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
+	if (fpu_allocated(&src->thread.fpu)) {
+		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
+		ret = fpu_alloc(&dst->thread.fpu);
+		if (ret)
+			return ret;
+		fpu_copy(&dst->thread.fpu, &src->thread.fpu);
 	}
 	return 0;
 }
 
 void free_thread_xstate(struct task_struct *tsk)
 {
-	if (tsk->thread.xstate) {
-		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
-		tsk->thread.xstate = NULL;
-	}
-
+	fpu_free(&tsk->thread.fpu);
 	WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c62667e30c..0a7a4f5bbaa9 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -317,7 +317,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/* we're going to use this soon, after a few expensive things */
 	if (preload_fpu)
-		prefetch(next->xstate);
+		prefetch(next->fpu.state);
 
 	/*
 	 * Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 17cb3295cbf7..979215f51985 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -396,7 +396,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/* we're going to use this soon, after a few expensive things */
 	if (preload_fpu)
-		prefetch(next->xstate);
+		prefetch(next->fpu.state);
 
 	/*
 	 * Reload esp0, LDT and the page table pointer:
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index c1b0a11033a2..37e68fc5e24a 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -109,7 +109,7 @@ int save_i387_xstate(void __user *buf)
 		task_thread_info(tsk)->status &= ~TS_USEDFPU;
 		stts();
 	} else {
-		if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
+		if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
 				   xstate_size))
 			return -1;
 	}
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index aa0987088774..62797f930511 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,10 +30,10 @@ static void fclex(void)
 }
 
 /* Needs to be externally visible */
-void finit_task(struct task_struct *tsk)
+void finit_soft_fpu(struct i387_soft_struct *soft)
 {
-	struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
 	struct address *oaddr, *iaddr;
+	memset(soft, 0, sizeof(*soft));
 	soft->cwd = 0x037f;
 	soft->swd = 0;
 	soft->ftop = 0;	/* We don't keep top in the status word internally. */
@@ -52,7 +52,7 @@ void finit_task(struct task_struct *tsk)
 
 void finit(void)
 {
-	finit_task(current);
+	finit_task(&current->thread.fpu);
 }
 
 /*
-- 
cgit v1.2.3-55-g7522


From c3f8978ea332cd4be88e12574452a025892ac9af Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Mon, 10 May 2010 13:37:16 -0700
Subject: x86, fpu: Unbreak FPU emulation

Unbreak FPU emulation, broken by checkin
86603283326c9e95e5ad4e9fdddeec93cac5d9ad:
x86: Introduce 'struct fpu' and related API

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-3-git-send-email-avi@redhat.com>
---
 arch/x86/math-emu/fpu_aux.c    | 2 +-
 arch/x86/math-emu/fpu_entry.c  | 4 ++--
 arch/x86/math-emu/fpu_system.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 62797f930511..dc8adad10a2f 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -52,7 +52,7 @@ void finit_soft_fpu(struct i387_soft_struct *soft)
 
 void finit(void)
 {
-	finit_task(&current->thread.fpu);
+	finit_soft_fpu(&current->thread.fpu.state->soft);
 }
 
 /*
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 5d87f586f8d7..7718541541d4 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -681,7 +681,7 @@ int fpregs_soft_set(struct task_struct *target,
 		    unsigned int pos, unsigned int count,
 		    const void *kbuf, const void __user *ubuf)
 {
-	struct i387_soft_struct *s387 = &target->thread.xstate->soft;
+	struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
 	void *space = s387->st_space;
 	int ret;
 	int offset, other, i, tags, regnr, tag, newtop;
@@ -733,7 +733,7 @@ int fpregs_soft_get(struct task_struct *target,
 		    unsigned int pos, unsigned int count,
 		    void *kbuf, void __user *ubuf)
 {
-	struct i387_soft_struct *s387 = &target->thread.xstate->soft;
+	struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
 	const void *space = s387->st_space;
 	int ret;
 	int offset = (S387->ftop & 7) * 10, other = 80 - offset;
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 50fa0ec2c8a5..2c614410a5f3 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -31,7 +31,7 @@
 #define SEG_EXPAND_DOWN(s)	(((s).b & ((1 << 11) | (1 << 10))) \
 				 == (1 << 10))
 
-#define I387			(current->thread.xstate)
+#define I387			(current->thread.fpu.state)
 #define FPU_info		(I387->soft.info)
 
 #define FPU_CS			(*(unsigned short *) &(FPU_info->regs->cs))
-- 
cgit v1.2.3-55-g7522


From dce8bf4e115aa44d590802ce3554e926840c9042 Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Mon, 10 May 2010 13:41:41 -0700
Subject: x86, fpu: Use the proper asm constraint in use_xsave()

The proper constraint for a receiving 8-bit variable is "=qm", not
"=g" which equals "=rim"; even though the "i" will never match, bugs
can and do happen due to the difference between "q" and "r".

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-2-git-send-email-avi@redhat.com>
---
 arch/x86/include/asm/i387.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 1a8cca33b736..8002e9ce25fc 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -64,7 +64,7 @@ static inline bool use_xsave(void)
 	alternative_io("mov $0, %0",
 		       "mov $1, %0",
 		       X86_FEATURE_XSAVE,
-		       "=g"(has_xsave));
+		       "=qm" (has_xsave));
 
 	return has_xsave;
 }
-- 
cgit v1.2.3-55-g7522


From 829e92458532b1dbfeb972435d45bb060cdbf5a3 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Tue, 27 Apr 2010 18:33:49 -0400
Subject: kprobes/x86: Fix removed int3 checking order

Fix kprobe/x86 to check removed int3 when failing to get kprobe
from hlist. Since we have a time window between checking int3
exists on probed address and getting kprobe on that address,
we can have following scenario:

 -------
 CPU1                     CPU2
 hit int3
 check int3 exists
                          remove int3
                          remove kprobe from hlist
 get kprobe from hlist
 no kprobe->OOPS!
 -------

This patch moves int3 checking if there is no kprobe on that
address for fixing this problem as follows:

 ------
 CPU1                     CPU2
 hit int3
                          remove int3
                          remove kprobe from hlist
 get kprobe from hlist
 no kprobe->check int3 exists
          ->rollback&retry
 ------

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Dave Anderson <anderson@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100427223348.2322.9112.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/kprobes.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b43bbaebe2c0..1658efdfb4e5 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -534,20 +534,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 	struct kprobe_ctlblk *kcb;
 
 	addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
-	if (*addr != BREAKPOINT_INSTRUCTION) {
-		/*
-		 * The breakpoint instruction was removed right
-		 * after we hit it.  Another cpu has removed
-		 * either a probepoint or a debugger breakpoint
-		 * at this address.  In either case, no further
-		 * handling of this interrupt is appropriate.
-		 * Back up over the (now missing) int3 and run
-		 * the original instruction.
-		 */
-		regs->ip = (unsigned long)addr;
-		return 1;
-	}
-
 	/*
 	 * We don't want to be preempted for the entire
 	 * duration of kprobe processing. We conditionally
@@ -579,6 +565,19 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 				setup_singlestep(p, regs, kcb, 0);
 			return 1;
 		}
+	} else if (*addr != BREAKPOINT_INSTRUCTION) {
+		/*
+		 * The breakpoint instruction was removed right
+		 * after we hit it.  Another cpu has removed
+		 * either a probepoint or a debugger breakpoint
+		 * at this address.  In either case, no further
+		 * handling of this interrupt is appropriate.
+		 * Back up over the (now missing) int3 and run
+		 * the original instruction.
+		 */
+		regs->ip = (unsigned long)addr;
+		preempt_enable_no_resched();
+		return 1;
 	} else if (kprobe_running()) {
 		p = __get_cpu_var(current_kprobe);
 		if (p->break_handler && p->break_handler(p, regs)) {
-- 
cgit v1.2.3-55-g7522


From d19f61f098ae9315b76a97962007f687683916d4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 17 Feb 2010 14:35:25 +0000
Subject: x86/PCI: Convert pci_config_lock to raw_spinlock

pci_config_lock must be a real spinlock in preempt-rt. Convert it to
raw_spinlock. No change for !RT kernels.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/include/asm/pci_x86.h |  2 +-
 arch/x86/pci/common.c          |  2 +-
 arch/x86/pci/direct.c          | 16 ++++++++--------
 arch/x86/pci/mmconfig_32.c     |  8 ++++----
 arch/x86/pci/numaq_32.c        |  8 ++++----
 arch/x86/pci/pcbios.c          |  8 ++++----
 6 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 1a0422348d6d..8d8797eae5d7 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -83,7 +83,7 @@ struct irq_routing_table {
 
 extern unsigned int pcibios_irq_mask;
 
-extern spinlock_t pci_config_lock;
+extern raw_spinlock_t pci_config_lock;
 
 extern int (*pcibios_enable_irq)(struct pci_dev *dev);
 extern void (*pcibios_disable_irq)(struct pci_dev *dev);
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index cf2e93869c48..215a27ae050d 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -76,7 +76,7 @@ struct pci_ops pci_root_ops = {
  * This interrupt-safe spinlock protects all accesses to PCI
  * configuration space.
  */
-DEFINE_SPINLOCK(pci_config_lock);
+DEFINE_RAW_SPINLOCK(pci_config_lock);
 
 static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
 {
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 347d882b3bb3..bd33620b0071 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -27,7 +27,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus,
 		return -EINVAL;
 	}
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
 
@@ -43,7 +43,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -56,7 +56,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
 	if ((bus > 255) || (devfn > 255) || (reg > 4095))
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
 
@@ -72,7 +72,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -108,7 +108,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus,
 	if (dev & 0x10) 
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	outb((u8)(0xF0 | (fn << 1)), 0xCF8);
 	outb((u8)bus, 0xCFA);
@@ -127,7 +127,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus,
 
 	outb(0, 0xCF8);
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -147,7 +147,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
 	if (dev & 0x10) 
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	outb((u8)(0xF0 | (fn << 1)), 0xCF8);
 	outb((u8)bus, 0xCFA);
@@ -166,7 +166,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
 
 	outb(0, 0xCF8);    
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 90d5fd476ed4..a3d9c54792ae 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -64,7 +64,7 @@ err:		*value = -1;
 	if (!base)
 		goto err;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	pci_exp_set_dev_base(base, bus, devfn);
 
@@ -79,7 +79,7 @@ err:		*value = -1;
 		*value = mmio_config_readl(mmcfg_virt_addr + reg);
 		break;
 	}
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -97,7 +97,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
 	if (!base)
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	pci_exp_set_dev_base(base, bus, devfn);
 
@@ -112,7 +112,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
 		mmio_config_writel(mmcfg_virt_addr + reg, value);
 		break;
 	}
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 8223738ad806..5c9e2458df4e 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -37,7 +37,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
 	if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	write_cf8(bus, devfn, reg);
 
@@ -62,7 +62,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -76,7 +76,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
 	if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) 
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	write_cf8(bus, devfn, reg);
 
@@ -101,7 +101,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 59a225c17b84..2492d165096a 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -162,7 +162,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
 	if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	switch (len) {
 	case 1:
@@ -213,7 +213,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return (int)((result & 0xff00) >> 8);
 }
@@ -228,7 +228,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
 	if ((bus > 255) || (devfn > 255) || (reg > 255)) 
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	switch (len) {
 	case 1:
@@ -269,7 +269,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return (int)((result & 0xff00) >> 8);
 }
-- 
cgit v1.2.3-55-g7522


From 33852cb03ee4cdb05dc6e3a21ec19a4ee63511a4 Mon Sep 17 00:00:00 2001
From: Seth Heasley
Date: Thu, 25 Mar 2010 16:11:37 -0700
Subject: x86/PCI: irq and pci_ids patch for additional Intel Cougar Point
 DeviceIDs

This patch adds additional LPC Controller DeviceIDs for the Intel Cougar
Point PCH.

The DeviceIDs are defined and referenced as a range of values, the same
way Ibex Peak was implemented.

Signed-off-by: Seth Heasley <seth.heasley@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/irq.c      | 9 +++++++--
 include/linux/pci_ids.h | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 5d362b5ba06f..9810a0f76c91 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -589,8 +589,6 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
 	case PCI_DEVICE_ID_INTEL_ICH10_1:
 	case PCI_DEVICE_ID_INTEL_ICH10_2:
 	case PCI_DEVICE_ID_INTEL_ICH10_3:
-	case PCI_DEVICE_ID_INTEL_CPT_LPC1:
-	case PCI_DEVICE_ID_INTEL_CPT_LPC2:
 		r->name = "PIIX/ICH";
 		r->get = pirq_piix_get;
 		r->set = pirq_piix_set;
@@ -605,6 +603,13 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
 		return 1;
 	}
 
+	if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) && 
+		(device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) {
+		r->name = "PIIX/ICH";
+		r->get = pirq_piix_get;
+		r->set = pirq_piix_set;
+		return 1;
+	}
 	return 0;
 }
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 9f688d243b86..ae66851870be 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2419,8 +2419,8 @@
 #define PCI_DEVICE_ID_INTEL_82845_HB	0x1a30
 #define PCI_DEVICE_ID_INTEL_IOAT	0x1a38
 #define PCI_DEVICE_ID_INTEL_CPT_SMBUS	0x1c22
-#define PCI_DEVICE_ID_INTEL_CPT_LPC1	0x1c42
-#define PCI_DEVICE_ID_INTEL_CPT_LPC2	0x1c43
+#define PCI_DEVICE_ID_INTEL_CPT_LPC_MIN	0x1c41
+#define PCI_DEVICE_ID_INTEL_CPT_LPC_MAX	0x1c5f
 #define PCI_DEVICE_ID_INTEL_82801AA_0	0x2410
 #define PCI_DEVICE_ID_INTEL_82801AA_1	0x2411
 #define PCI_DEVICE_ID_INTEL_82801AA_3	0x2413
-- 
cgit v1.2.3-55-g7522


From a3c8acd04376d604370dcb6cd2143c9c14078a50 Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Tue, 11 May 2010 17:47:07 -0700
Subject: x86: Add new static_cpu_has() function using alternatives

For CPU-feature-specific code that touches performance-critical paths,
introduce a static patching version of [boot_]cpu_has().  This is run
at alternatives time and is therefore not appropriate for most
initialization code, but on the other hand initialization code is
generally not performance critical.

On gcc 4.5+ this uses the new "asm goto" feature.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-2-git-send-email-avi@redhat.com>
---
 arch/x86/include/asm/cpufeature.h | 57 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 0cd82d068613..9b11a5cc6662 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -175,6 +175,7 @@
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
+#include <asm/asm.h>
 #include <linux/bitops.h>
 
 extern const char * const x86_cap_flags[NCAPINTS*32];
@@ -283,6 +284,62 @@ extern const char * const x86_power_flags[32];
 
 #endif /* CONFIG_X86_64 */
 
+/*
+ * Static testing of CPU features.  Used the same as boot_cpu_has().
+ * These are only valid after alternatives have run, but will statically
+ * patch the target code for additional performance.
+ *
+ */
+static __always_inline __pure bool __static_cpu_has(u8 bit)
+{
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
+		asm goto("1: jmp %l[t_no]\n"
+			 "2:\n"
+			 ".section .altinstructions,\"a\"\n"
+			 _ASM_ALIGN "\n"
+			 _ASM_PTR "1b\n"
+			 _ASM_PTR "0\n" 	/* no replacement */
+			 " .byte %P0\n"		/* feature bit */
+			 " .byte 2b - 1b\n"	/* source len */
+			 " .byte 0\n"		/* replacement len */
+			 " .byte 0xff + 0 - (2b-1b)\n"	/* padding */
+			 ".previous\n"
+			 : : "i" (bit) : : t_no);
+		return true;
+	t_no:
+		return false;
+#else
+		u8 flag;
+		/* Open-coded due to __stringify() in ALTERNATIVE() */
+		asm volatile("1: movb $0,%0\n"
+			     "2:\n"
+			     ".section .altinstructions,\"a\"\n"
+			     _ASM_ALIGN "\n"
+			     _ASM_PTR "1b\n"
+			     _ASM_PTR "3f\n"
+			     " .byte %P1\n"		/* feature bit */
+			     " .byte 2b - 1b\n"		/* source len */
+			     " .byte 4f - 3f\n"		/* replacement len */
+			     " .byte 0xff + (4f-3f) - (2b-1b)\n" /* padding */
+			     ".previous\n"
+			     ".section .altinstr_replacement,\"ax\"\n"
+			     "3: movb $1,%0\n"
+			     "4:\n"
+			     ".previous\n"
+			     : "=qm" (flag) : "i" (bit));
+		return flag;
+#endif
+}
+
+#define static_cpu_has(bit)					\
+(								\
+	__builtin_constant_p(boot_cpu_has(bit)) ?		\
+		boot_cpu_has(bit) :				\
+	(__builtin_constant_p(bit) && !((bit) & ~0xff)) ?	\
+		__static_cpu_has(bit) :				\
+		boot_cpu_has(bit)				\
+)
+
 #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
 
 #endif /* _ASM_X86_CPUFEATURE_H */
-- 
cgit v1.2.3-55-g7522


From c9775b4cc522e5f1b40b1366a993f0f05f600f39 Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Tue, 11 May 2010 17:49:54 -0700
Subject: x86, fpu: Use static_cpu_has() to implement use_xsave()

use_xsave() is now just a special case of static_cpu_has(), so use
static_cpu_has().

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-2-git-send-email-avi@redhat.com>
---
 arch/x86/include/asm/i387.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 8002e9ce25fc..c991b3a7b904 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -18,6 +18,7 @@
 #include <linux/hardirq.h>
 #include <linux/slab.h>
 #include <asm/asm.h>
+#include <asm/cpufeature.h>
 #include <asm/processor.h>
 #include <asm/sigcontext.h>
 #include <asm/user.h>
@@ -57,16 +58,9 @@ extern int restore_i387_xstate_ia32(void __user *buf);
 
 #define X87_FSW_ES (1 << 7)	/* Exception Summary */
 
-static inline bool use_xsave(void)
+static __always_inline __pure bool use_xsave(void)
 {
-	u8 has_xsave;
-
-	alternative_io("mov $0, %0",
-		       "mov $1, %0",
-		       X86_FEATURE_XSAVE,
-		       "=qm" (has_xsave));
-
-	return has_xsave;
+	return static_cpu_has(X86_FEATURE_XSAVE);
 }
 
 #ifdef CONFIG_X86_64
-- 
cgit v1.2.3-55-g7522


From b6dacf63e9fb2e7a1369843d6cef332f76fca6a3 Mon Sep 17 00:00:00 2001
From: Matthew Garrett
Date: Tue, 11 May 2010 13:49:25 -0400
Subject: ACPI: Unconditionally set SCI_EN on resume

The ACPI spec tells us that the firmware will reenable SCI_EN on resume.
Reality disagrees in some cases. The ACPI spec tells us that the only way
to set SCI_EN is via an SMM call.
https://bugzilla.kernel.org/show_bug.cgi?id=13745 shows us that doing so
may break machines. Tracing the ACPI calls made by Windows shows that it
unconditionally sets SCI_EN on resume with a direct register write, and
therefore the overwhelming probability is that everything is fine with
this behaviour.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Tested-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/acpi/sleep.c |   2 -
 drivers/acpi/sleep.c         | 157 +------------------------------------------
 include/linux/acpi.h         |   1 -
 3 files changed, 2 insertions(+), 158 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index f9961034e557..82e508677b91 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -162,8 +162,6 @@ static int __init acpi_sleep_setup(char *str)
 #endif
 		if (strncmp(str, "old_ordering", 12) == 0)
 			acpi_old_suspend_ordering();
-		if (strncmp(str, "sci_force_enable", 16) == 0)
-			acpi_set_sci_en_on_resume();
 		str = strchr(str, ',');
 		if (str != NULL)
 			str += strspn(str, ", \t");
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index baa76bbf244a..4ab2275b4461 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -80,22 +80,6 @@ static int acpi_sleep_prepare(u32 acpi_state)
 
 #ifdef CONFIG_ACPI_SLEEP
 static u32 acpi_target_sleep_state = ACPI_STATE_S0;
-/*
- * According to the ACPI specification the BIOS should make sure that ACPI is
- * enabled and SCI_EN bit is set on wake-up from S1 - S3 sleep states.  Still,
- * some BIOSes don't do that and therefore we use acpi_enable() to enable ACPI
- * on such systems during resume.  Unfortunately that doesn't help in
- * particularly pathological cases in which SCI_EN has to be set directly on
- * resume, although the specification states very clearly that this flag is
- * owned by the hardware.  The set_sci_en_on_resume variable will be set in such
- * cases.
- */
-static bool set_sci_en_on_resume;
-
-void __init acpi_set_sci_en_on_resume(void)
-{
-	set_sci_en_on_resume = true;
-}
 
 /*
  * ACPI 1.0 wants us to execute _PTS before suspending devices, so we allow the
@@ -253,11 +237,8 @@ static int acpi_suspend_enter(suspend_state_t pm_state)
 		break;
 	}
 
-	/* If ACPI is not enabled by the BIOS, we need to enable it here. */
-	if (set_sci_en_on_resume)
-		acpi_write_bit_register(ACPI_BITREG_SCI_ENABLE, 1);
-	else
-		acpi_enable();
+	/* This violates the spec but is required for bug compatibility. */
+	acpi_write_bit_register(ACPI_BITREG_SCI_ENABLE, 1);
 
 	/* Reprogram control registers and execute _BFS */
 	acpi_leave_sleep_state_prep(acpi_state);
@@ -346,12 +327,6 @@ static int __init init_old_suspend_ordering(const struct dmi_system_id *d)
 	return 0;
 }
 
-static int __init init_set_sci_en_on_resume(const struct dmi_system_id *d)
-{
-	set_sci_en_on_resume = true;
-	return 0;
-}
-
 static struct dmi_system_id __initdata acpisleep_dmi_table[] = {
 	{
 	.callback = init_old_suspend_ordering,
@@ -370,22 +345,6 @@ static struct dmi_system_id __initdata acpisleep_dmi_table[] = {
 		},
 	},
 	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Apple MacBook 1,1",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Apple Computer, Inc."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "MacBook1,1"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Apple MacMini 1,1",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Apple Computer, Inc."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Macmini1,1"),
-		},
-	},
-	{
 	.callback = init_old_suspend_ordering,
 	.ident = "Asus Pundit P1-AH2 (M2N8L motherboard)",
 	.matches = {
@@ -394,94 +353,6 @@ static struct dmi_system_id __initdata acpisleep_dmi_table[] = {
 		},
 	},
 	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Toshiba Satellite L300",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Satellite L300"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Hewlett-Packard HP G7000 Notebook PC",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "HP G7000 Notebook PC"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Hewlett-Packard HP Pavilion dv3 Notebook PC",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion dv3 Notebook PC"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Hewlett-Packard Pavilion dv4",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion dv4"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Hewlett-Packard Pavilion dv7",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion dv7"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Hewlett-Packard Compaq Presario C700 Notebook PC",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Compaq Presario C700 Notebook PC"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Hewlett-Packard Compaq Presario CQ40 Notebook PC",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Compaq Presario CQ40 Notebook PC"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Lenovo ThinkPad T410",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T410"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Lenovo ThinkPad T510",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T510"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Lenovo ThinkPad W510",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad W510"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Lenovo ThinkPad X201[s]",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad X201"),
-		},
-	},
-	{
 	.callback = init_old_suspend_ordering,
 	.ident = "Panasonic CF51-2L",
 	.matches = {
@@ -490,30 +361,6 @@ static struct dmi_system_id __initdata acpisleep_dmi_table[] = {
 		DMI_MATCH(DMI_BOARD_NAME, "CF51-2L"),
 		},
 	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Dell Studio 1558",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1558"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Dell Studio 1557",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1557"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Dell Studio 1555",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1555"),
-		},
-	},
 	{},
 };
 #endif /* CONFIG_SUSPEND */
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index b926afe8c03e..87ca4913294c 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -251,7 +251,6 @@ int acpi_check_mem_region(resource_size_t start, resource_size_t n,
 void __init acpi_no_s4_hw_signature(void);
 void __init acpi_old_suspend_ordering(void);
 void __init acpi_s4_no_nvs(void);
-void __init acpi_set_sci_en_on_resume(void);
 #endif /* CONFIG_PM_SLEEP */
 
 struct acpi_osc_context {
-- 
cgit v1.2.3-55-g7522


From 061e2fd16863009c8005b4b5fdfb75c7215c0b99 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 5 May 2010 16:04:43 +0200
Subject: KVM: SVM: Fix wrong intercept masks on 32 bit

This patch makes KVM on 32 bit SVM working again by
correcting the masks used for iret interception. With the
wrong masks the upper 32 bits of the intercepts are masked
out which leaves vmrun unintercepted. This is not legal on
svm and the vmrun fails.
Bug was introduced by commits 95ba827313 and 3cfc3092.

Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2ba58206812a..737361fcd503 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2067,7 +2067,7 @@ static int cpuid_interception(struct vcpu_svm *svm)
 static int iret_interception(struct vcpu_svm *svm)
 {
 	++svm->vcpu.stat.nmi_window_exits;
-	svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
 	svm->vcpu.arch.hflags |= HF_IRET_MASK;
 	return 1;
 }
@@ -2479,7 +2479,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 
 	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
 	vcpu->arch.hflags |= HF_NMI_MASK;
-	svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+	svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
 	++vcpu->stat.nmi_injections;
 }
 
@@ -2539,10 +2539,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 	if (masked) {
 		svm->vcpu.arch.hflags |= HF_NMI_MASK;
-		svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+		svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
 	} else {
 		svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
-		svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+		svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
 	}
 }
 
-- 
cgit v1.2.3-55-g7522


From fe19c5a46b4c519153fddd4d5efe32a3e4cfa694 Mon Sep 17 00:00:00 2001
From: Dongxiao Xu
Date: Tue, 11 May 2010 18:21:33 +0800
Subject: KVM: x86: Call vcpu_load and vcpu_put in cpuid_update

cpuid_update may operate VMCS, so vcpu_load() and vcpu_put()
should be called to ensure correctness.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c4ca98ad27f..c4f35b545c1d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1712,6 +1712,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	if (copy_from_user(cpuid_entries, entries,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
 		goto out_free;
+	vcpu_load(vcpu);
 	for (i = 0; i < cpuid->nent; i++) {
 		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
 		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1729,6 +1730,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	r = 0;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
+	vcpu_put(vcpu);
 
 out_free:
 	vfree(cpuid_entries);
@@ -1749,9 +1751,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
 		goto out;
+	vcpu_load(vcpu);
 	vcpu->arch.cpuid_nent = cpuid->nent;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
+	vcpu_put(vcpu);
 	return 0;
 
 out:
-- 
cgit v1.2.3-55-g7522


From f8c5fae16649445e15656667f72bd51d777f7766 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Tue, 11 May 2010 15:16:46 +0200
Subject: KVM: VMX: blocked-by-sti must not defer NMI injections

As the processor may not consider GUEST_INTR_STATE_STI as a reason for
blocking NMI, it could return immediately with EXIT_REASON_NMI_WINDOW
when we asked for it. But as we consider this state as NMI-blocking, we
can run into an endless loop.

Resolve this by allowing NMI injection if just GUEST_INTR_STATE_STI is
active (originally suggested by Gleb). Intel confirmed that this is
safe, the processor will never complain about NMI injection in this
state.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
KVM-Stable-Tag
Acked-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bc933cfb4e66..2f8db0ec8ae4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2703,8 +2703,7 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 		return 0;
 
 	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
-				GUEST_INTR_STATE_NMI));
+			(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
 }
 
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3-55-g7522


From 720019908fd5a1bb442bb0a35a6027ba21864d25 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Wed, 12 May 2010 21:42:42 +0400
Subject: x86, perf: P4 PMU -- use hash for p4_get_escr_idx()

Linear search over all p4 MSRs should be fine if only
we would not use it in events scheduling routine which
is pretty time critical. Lets use hashes. It should speed
scheduling up significantly.

v2: Steven proposed to use more gentle approach than issue
    BUG on error, so we use WARN_ONCE now

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100512174242.GA5190@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 126 ++++++++++++++++++++----------------
 1 file changed, 71 insertions(+), 55 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index a603930271f3..cb875b1e2e87 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -668,66 +668,80 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 	}
 }
 
-/* ESCRs are not sequential in memory so we need a map */
-static const unsigned int p4_escr_map[ARCH_P4_TOTAL_ESCR] = {
-	MSR_P4_ALF_ESCR0,	/*  0 */
-	MSR_P4_ALF_ESCR1,	/*  1 */
-	MSR_P4_BPU_ESCR0,	/*  2 */
-	MSR_P4_BPU_ESCR1,	/*  3 */
-	MSR_P4_BSU_ESCR0,	/*  4 */
-	MSR_P4_BSU_ESCR1,	/*  5 */
-	MSR_P4_CRU_ESCR0,	/*  6 */
-	MSR_P4_CRU_ESCR1,	/*  7 */
-	MSR_P4_CRU_ESCR2,	/*  8 */
-	MSR_P4_CRU_ESCR3,	/*  9 */
-	MSR_P4_CRU_ESCR4,	/* 10 */
-	MSR_P4_CRU_ESCR5,	/* 11 */
-	MSR_P4_DAC_ESCR0,	/* 12 */
-	MSR_P4_DAC_ESCR1,	/* 13 */
-	MSR_P4_FIRM_ESCR0,	/* 14 */
-	MSR_P4_FIRM_ESCR1,	/* 15 */
-	MSR_P4_FLAME_ESCR0,	/* 16 */
-	MSR_P4_FLAME_ESCR1,	/* 17 */
-	MSR_P4_FSB_ESCR0,	/* 18 */
-	MSR_P4_FSB_ESCR1,	/* 19 */
-	MSR_P4_IQ_ESCR0,	/* 20 */
-	MSR_P4_IQ_ESCR1,	/* 21 */
-	MSR_P4_IS_ESCR0,	/* 22 */
-	MSR_P4_IS_ESCR1,	/* 23 */
-	MSR_P4_ITLB_ESCR0,	/* 24 */
-	MSR_P4_ITLB_ESCR1,	/* 25 */
-	MSR_P4_IX_ESCR0,	/* 26 */
-	MSR_P4_IX_ESCR1,	/* 27 */
-	MSR_P4_MOB_ESCR0,	/* 28 */
-	MSR_P4_MOB_ESCR1,	/* 29 */
-	MSR_P4_MS_ESCR0,	/* 30 */
-	MSR_P4_MS_ESCR1,	/* 31 */
-	MSR_P4_PMH_ESCR0,	/* 32 */
-	MSR_P4_PMH_ESCR1,	/* 33 */
-	MSR_P4_RAT_ESCR0,	/* 34 */
-	MSR_P4_RAT_ESCR1,	/* 35 */
-	MSR_P4_SAAT_ESCR0,	/* 36 */
-	MSR_P4_SAAT_ESCR1,	/* 37 */
-	MSR_P4_SSU_ESCR0,	/* 38 */
-	MSR_P4_SSU_ESCR1,	/* 39 */
-	MSR_P4_TBPU_ESCR0,	/* 40 */
-	MSR_P4_TBPU_ESCR1,	/* 41 */
-	MSR_P4_TC_ESCR0,	/* 42 */
-	MSR_P4_TC_ESCR1,	/* 43 */
-	MSR_P4_U2L_ESCR0,	/* 44 */
-	MSR_P4_U2L_ESCR1,	/* 45 */
+/*
+ * ESCR address hashing is tricky, ESCRs are not sequential
+ * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03e0) and
+ * the metric between any ESCRs is laid in range [0xa0,0xe1]
+ *
+ * so we make ~70% filled hashtable
+ */
+
+#define P4_ESCR_MSR_BASE		0x000003a0
+#define P4_ESCR_MSR_MAX			0x000003e1
+#define P4_ESCR_MSR_TABLE_SIZE		(P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
+#define P4_ESCR_MSR_IDX(msr)		(msr - P4_ESCR_MSR_BASE)
+#define P4_ESCR_MSR_TABLE_ENTRY(msr)	[P4_ESCR_MSR_IDX(msr)] = msr
+
+static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
 };
 
 static int p4_get_escr_idx(unsigned int addr)
 {
-	unsigned int i;
+	unsigned int idx = P4_ESCR_MSR_IDX(addr);
 
-	for (i = 0; i < ARRAY_SIZE(p4_escr_map); i++) {
-		if (addr == p4_escr_map[i])
-			return i;
+	if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
+			!p4_escr_table[idx])) {
+		WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
+		return -1;
 	}
 
-	return -1;
+	return idx;
 }
 
 static int p4_next_cntr(int thread, unsigned long *used_mask,
@@ -747,7 +761,7 @@ static int p4_next_cntr(int thread, unsigned long *used_mask,
 static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	unsigned long escr_mask[BITS_TO_LONGS(ARCH_P4_TOTAL_ESCR)];
+	unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
 	int cpu = raw_smp_processor_id();
 	struct hw_perf_event *hwc;
 	struct p4_event_bind *bind;
@@ -755,7 +769,7 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 	int cntr_idx, escr_idx;
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
-	bitmap_zero(escr_mask, ARCH_P4_TOTAL_ESCR);
+	bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
 
 	for (i = 0, num = n; i < n; i++, num--) {
 
@@ -763,6 +777,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 		thread = p4_ht_thread(cpu);
 		bind = p4_config_get_bind(hwc->config);
 		escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
+		if (unlikely(escr_idx == -1))
+			goto done;
 
 		if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
 			cntr_idx = hwc->idx;
-- 
cgit v1.2.3-55-g7522


From f01487119dda3d9f58c9729c7361ecc50a61c188 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann
Date: Tue, 27 Apr 2010 12:13:48 +0200
Subject: x86, amd: Check X86_FEATURE_OSVW bit before accessing OSVW MSRs

If host CPU is exposed to a guest the OSVW MSRs are not guaranteed
to be present and a GP fault occurs. Thus checking the feature flag is
essential.

Cc: <stable@kernel.org> # .32.x .33.x
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
LKML-Reference: <20100427101348.GC4489@alberich.amd.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/process.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4d8b94..0415c3ef91b5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -546,11 +546,13 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
 		 * check OSVW bit for CPUs that are not affected
 		 * by erratum #400
 		 */
-		rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
-		if (val >= 2) {
-			rdmsrl(MSR_AMD64_OSVW_STATUS, val);
-			if (!(val & BIT(1)))
-				goto no_c1e_idle;
+		if (cpu_has(c, X86_FEATURE_OSVW)) {
+			rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
+			if (val >= 2) {
+				rdmsrl(MSR_AMD64_OSVW_STATUS, val);
+				if (!(val & BIT(1)))
+					goto no_c1e_idle;
+			}
 		}
 		return 1;
 	}
-- 
cgit v1.2.3-55-g7522


From 9e565292270a2d55524be38835104c564ac8f795 Mon Sep 17 00:00:00 2001
From: Roland McGrath
Date: Thu, 13 May 2010 21:43:03 -0700
Subject: x86: Use .cfi_sections for assembly code

The newer assemblers support the .cfi_sections directive so we can put
the CFI from .S files into the .debug_frame section that is preserved
in unstripped vmlinux and in separate debuginfo, rather than the
.eh_frame section that is now discarded by vmlinux.lds.S.

Signed-off-by: Roland McGrath <roland@redhat.com>
LKML-Reference: <20100514044303.A6FE7400BE@magilla.sf.frob.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Makefile             |  5 +++--
 arch/x86/include/asm/dwarf2.h | 12 ++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 0a43dc515e4c..8aa1b59b9074 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -95,8 +95,9 @@ sp-$(CONFIG_X86_64) := rsp
 cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
 # is .cfi_signal_frame supported too?
 cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe)
+cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index ae6253ab9029..733f7e91e7a9 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -34,6 +34,18 @@
 #define CFI_SIGNAL_FRAME
 #endif
 
+#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__)
+	/*
+	 * Emit CFI data in .debug_frame sections, not .eh_frame sections.
+	 * The latter we currently just discard since we don't do DWARF
+	 * unwinding at runtime.  So only the offline DWARF information is
+	 * useful to anyone.  Note we should not use this directive if this
+	 * file is used in the vDSO assembly, or if vmlinux.lds.S gets
+	 * changed so it doesn't discard .eh_frame.
+	 */
+	.cfi_sections .debug_frame
+#endif
+
 #else
 
 /*
-- 
cgit v1.2.3-55-g7522


From 0ddc9324b1a842afd77e8e86698b1d1d2ffed022 Mon Sep 17 00:00:00 2001
From: Andreas Dilger
Date: Fri, 14 May 2010 11:13:27 +0200
Subject: add descriptive comment for TIF_MEMDIE task flag declaration.

Signed-off-by: Andreas Dilger <adilger@dilger.ca>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/alpha/include/asm/thread_info.h      | 2 +-
 arch/arm/include/asm/thread_info.h        | 2 +-
 arch/avr32/include/asm/thread_info.h      | 2 +-
 arch/blackfin/include/asm/thread_info.h   | 2 +-
 arch/cris/include/asm/thread_info.h       | 2 +-
 arch/frv/include/asm/thread_info.h        | 2 +-
 arch/h8300/include/asm/thread_info.h      | 2 +-
 arch/ia64/include/asm/thread_info.h       | 2 +-
 arch/m32r/include/asm/thread_info.h       | 2 +-
 arch/m68k/include/asm/thread_info_mm.h    | 2 +-
 arch/m68k/include/asm/thread_info_no.h    | 2 +-
 arch/microblaze/include/asm/thread_info.h | 2 +-
 arch/mips/include/asm/thread_info.h       | 2 +-
 arch/mn10300/include/asm/thread_info.h    | 2 +-
 arch/parisc/include/asm/thread_info.h     | 2 +-
 arch/powerpc/include/asm/thread_info.h    | 2 +-
 arch/s390/include/asm/thread_info.h       | 2 +-
 arch/score/include/asm/thread_info.h      | 2 +-
 arch/sh/include/asm/thread_info.h         | 2 +-
 arch/sparc/include/asm/thread_info_32.h   | 2 +-
 arch/sparc/include/asm/thread_info_64.h   | 2 +-
 arch/um/include/asm/thread_info.h         | 7 +++----
 arch/x86/include/asm/thread_info.h        | 2 +-
 arch/xtensa/include/asm/thread_info.h     | 2 +-
 24 files changed, 26 insertions(+), 27 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h
index b3e888638bb7..6f32f9c84a2d 100644
--- a/arch/alpha/include/asm/thread_info.h
+++ b/arch/alpha/include/asm/thread_info.h
@@ -77,7 +77,7 @@ register struct thread_info *__current_thread_info __asm__("$8");
 #define TIF_UAC_NOPRINT		10	/* see sysinfo.h */
 #define TIF_UAC_NOFIX		11
 #define TIF_UAC_SIGBUS		12
-#define TIF_MEMDIE		13
+#define TIF_MEMDIE		13	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	14	/* restore signal mask in do_signal */
 #define TIF_FREEZE		16	/* is freezing for suspend */
 
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index b74970ec02c4..763e29fa8530 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -141,7 +141,7 @@ extern void vfp_flush_hwstate(struct thread_info *);
 #define TIF_SYSCALL_TRACE	8
 #define TIF_POLLING_NRFLAG	16
 #define TIF_USING_IWMMXT	17
-#define TIF_MEMDIE		18
+#define TIF_MEMDIE		18	/* is terminating due to OOM killer */
 #define TIF_FREEZE		19
 #define TIF_RESTORE_SIGMASK	20
 
diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h
index fd0c5d7e9337..7a9c03dcb0b6 100644
--- a/arch/avr32/include/asm/thread_info.h
+++ b/arch/avr32/include/asm/thread_info.h
@@ -81,7 +81,7 @@ static inline struct thread_info *current_thread_info(void)
 					   TIF_NEED_RESCHED */
 #define TIF_BREAKPOINT		4	/* enter monitor mode on return */
 #define TIF_SINGLE_STEP		5	/* single step in progress */
-#define TIF_MEMDIE		6
+#define TIF_MEMDIE		6	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	7	/* restore signal mask in do_signal */
 #define TIF_CPU_GOING_TO_SLEEP	8	/* CPU is entering sleep 0 mode */
 #define TIF_NOTIFY_RESUME	9	/* callback before returning to user */
diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h
index e9a5614cdbb1..02560fd8a121 100644
--- a/arch/blackfin/include/asm/thread_info.h
+++ b/arch/blackfin/include/asm/thread_info.h
@@ -98,7 +98,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_NEED_RESCHED	2	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	3	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
-#define TIF_MEMDIE		4
+#define TIF_MEMDIE		4	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
 #define TIF_FREEZE		6	/* is freezing for suspend */
 #define TIF_IRQ_SYNC		7	/* sync pipeline stage */
diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h
index c3aade36c330..91776069ca80 100644
--- a/arch/cris/include/asm/thread_info.h
+++ b/arch/cris/include/asm/thread_info.h
@@ -85,7 +85,7 @@ struct thread_info {
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
-#define TIF_MEMDIE		17
+#define TIF_MEMDIE		17	/* is terminating due to OOM killer */
 #define TIF_FREEZE		18	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h
index e608e056bb53..11f33ead29bf 100644
--- a/arch/frv/include/asm/thread_info.h
+++ b/arch/frv/include/asm/thread_info.h
@@ -113,7 +113,7 @@ register struct thread_info *__current_thread_info asm("gr15");
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
-#define TIF_MEMDIE		17	/* OOM killer killed process */
+#define TIF_MEMDIE		17	/* is terminating due to OOM killer */
 #define TIF_FREEZE		18	/* freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h
index 70e67e47d020..d6f1784bfdee 100644
--- a/arch/h8300/include/asm/thread_info.h
+++ b/arch/h8300/include/asm/thread_info.h
@@ -87,7 +87,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_NEED_RESCHED	2	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	3	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
-#define TIF_MEMDIE		4
+#define TIF_MEMDIE		4	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
 #define TIF_NOTIFY_RESUME	6	/* callback before returning to user */
 #define TIF_FREEZE		16	/* is freezing for suspend */
diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index 8ce2e388e37c..b6a5ba2aca34 100644
--- a/arch/ia64/include/asm/thread_info.h
+++ b/arch/ia64/include/asm/thread_info.h
@@ -102,7 +102,7 @@ struct thread_info {
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_NOTIFY_RESUME	6	/* resumption notification requested */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
-#define TIF_MEMDIE		17
+#define TIF_MEMDIE		17	/* is terminating due to OOM killer */
 #define TIF_MCA_INIT		18	/* this task is processing MCA or INIT */
 #define TIF_DB_DISABLED		19	/* debug trap disabled for fsyscall */
 #define TIF_FREEZE		20	/* is freezing for suspend */
diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h
index ed240b6e8e77..71faff5bcc27 100644
--- a/arch/m32r/include/asm/thread_info.h
+++ b/arch/m32r/include/asm/thread_info.h
@@ -142,7 +142,7 @@ static inline unsigned int get_thread_fault_code(void)
 #define TIF_RESTORE_SIGMASK	8	/* restore signal mask in do_signal() */
 #define TIF_USEDFPU		16	/* FPU was used by this task this quantum (SMP) */
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling TIF_NEED_RESCHED */
-#define TIF_MEMDIE		18	/* OOM killer killed process */
+#define TIF_MEMDIE		18	/* is terminating due to OOM killer */
 #define TIF_FREEZE		19	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
diff --git a/arch/m68k/include/asm/thread_info_mm.h b/arch/m68k/include/asm/thread_info_mm.h
index 67266c683453..3bf31dc51b12 100644
--- a/arch/m68k/include/asm/thread_info_mm.h
+++ b/arch/m68k/include/asm/thread_info_mm.h
@@ -65,7 +65,7 @@ struct thread_info {
 #define TIF_NEED_RESCHED	7	/* rescheduling necessary */
 #define TIF_DELAYED_TRACE	14	/* single step a syscall */
 #define TIF_SYSCALL_TRACE	15	/* syscall trace active */
-#define TIF_MEMDIE		16
+#define TIF_MEMDIE		16	/* is terminating due to OOM killer */
 #define TIF_FREEZE		17	/* thread is freezing for suspend */
 
 #endif	/* _ASM_M68K_THREAD_INFO_H */
diff --git a/arch/m68k/include/asm/thread_info_no.h b/arch/m68k/include/asm/thread_info_no.h
index 884776f686ca..51f354b672e6 100644
--- a/arch/m68k/include/asm/thread_info_no.h
+++ b/arch/m68k/include/asm/thread_info_no.h
@@ -85,7 +85,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_NEED_RESCHED	2	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	3	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
-#define TIF_MEMDIE		4
+#define TIF_MEMDIE		4	/* is terminating due to OOM killer */
 #define TIF_FREEZE		16	/* is freezing for suspend */
 
 /* as above, but as bit values */
diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h
index b2ca80f64640..8a8e9fc6e0c0 100644
--- a/arch/microblaze/include/asm/thread_info.h
+++ b/arch/microblaze/include/asm/thread_info.h
@@ -122,7 +122,7 @@ static inline struct thread_info *current_thread_info(void)
 /* restore singlestep on return to user mode */
 #define TIF_SINGLESTEP		4
 #define TIF_IRET		5 /* return with iret */
-#define TIF_MEMDIE		6
+#define TIF_MEMDIE		6	/* is terminating due to OOM killer */
 #define TIF_SYSCALL_AUDIT	9       /* syscall auditing active */
 #define TIF_SECCOMP		10      /* secure computing */
 #define TIF_FREEZE		14	/* Freezing for suspend */
diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h
index 845da2107ed1..2376f2e06e47 100644
--- a/arch/mips/include/asm/thread_info.h
+++ b/arch/mips/include/asm/thread_info.h
@@ -112,7 +112,7 @@ register struct thread_info *__current_thread_info __asm__("$28");
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal() */
 #define TIF_USEDFPU		16	/* FPU was used by this task this quantum (SMP) */
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling TIF_NEED_RESCHED */
-#define TIF_MEMDIE		18
+#define TIF_MEMDIE		18	/* is terminating due to OOM killer */
 #define TIF_FREEZE		19
 #define TIF_FIXADE		20	/* Fix address errors in software */
 #define TIF_LOGADE		21	/* Log address errors to syslog */
diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h
index 58d64f8b2cc3..2001cb657a95 100644
--- a/arch/mn10300/include/asm/thread_info.h
+++ b/arch/mn10300/include/asm/thread_info.h
@@ -148,7 +148,7 @@ static inline unsigned long current_stack_pointer(void)
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
-#define TIF_MEMDIE		17	/* OOM killer killed process */
+#define TIF_MEMDIE		17	/* is terminating due to OOM killer */
 #define TIF_FREEZE		18	/* freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	+(1 << TIF_SYSCALL_TRACE)
diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h
index 7ecc1039cfed..aa8de727e90b 100644
--- a/arch/parisc/include/asm/thread_info.h
+++ b/arch/parisc/include/asm/thread_info.h
@@ -56,7 +56,7 @@ struct thread_info {
 #define TIF_NEED_RESCHED	2	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	3	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_32BIT               4       /* 32 bit binary */
-#define TIF_MEMDIE		5
+#define TIF_MEMDIE		5	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	6	/* restore saved signal mask */
 #define TIF_FREEZE		7	/* is freezing for suspend */
 #define TIF_NOTIFY_RESUME	8	/* callback before returning to user */
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index aa9d383a1c09..65eb85976a03 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -104,7 +104,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_PERFMON_CTXSW	6	/* perfmon needs ctxsw calls */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SINGLESTEP		8	/* singlestepping active */
-#define TIF_MEMDIE		9
+#define TIF_MEMDIE		9	/* is terminating due to OOM killer */
 #define TIF_SECCOMP		10	/* secure computing */
 #define TIF_RESTOREALL		11	/* Restore all regs (implies NOERROR) */
 #define TIF_NOERROR		12	/* Force successful syscall return */
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index 34f0873d6525..370b4df68e88 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -96,7 +96,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
 #define TIF_31BIT		17	/* 32bit process */
-#define TIF_MEMDIE		18
+#define TIF_MEMDIE		18	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	19	/* restore signal mask in do_signal() */
 #define TIF_FREEZE		20	/* thread is freezing for suspend */
 
diff --git a/arch/score/include/asm/thread_info.h b/arch/score/include/asm/thread_info.h
index 55939992c27d..8570d08f58c1 100644
--- a/arch/score/include/asm/thread_info.h
+++ b/arch/score/include/asm/thread_info.h
@@ -92,7 +92,7 @@ register struct thread_info *__current_thread_info __asm__("r28");
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling
 						 TIF_NEED_RESCHED */
-#define TIF_MEMDIE		18
+#define TIF_MEMDIE		18	/* is terminating due to OOM killer */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h
index 55a36fef6875..c228946926ed 100644
--- a/arch/sh/include/asm/thread_info.h
+++ b/arch/sh/include/asm/thread_info.h
@@ -121,7 +121,7 @@ extern void init_thread_xstate(void);
 #define TIF_NOTIFY_RESUME	7	/* callback before returning to user */
 #define TIF_SYSCALL_TRACEPOINT	8	/* for ftrace syscall instrumentation */
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling TIF_NEED_RESCHED */
-#define TIF_MEMDIE		18
+#define TIF_MEMDIE		18	/* is terminating due to OOM killer */
 #define TIF_FREEZE		19	/* Freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h
index 844d73a0340c..9dd0318d3ddf 100644
--- a/arch/sparc/include/asm/thread_info_32.h
+++ b/arch/sparc/include/asm/thread_info_32.h
@@ -132,7 +132,7 @@ BTFIXUPDEF_CALL(void, free_thread_info, struct thread_info *)
 					 * this quantum (SMP) */
 #define TIF_POLLING_NRFLAG	9	/* true if poll_idle() is polling
 					 * TIF_NEED_RESCHED */
-#define TIF_MEMDIE		10
+#define TIF_MEMDIE		10	/* is terminating due to OOM killer */
 #define TIF_FREEZE		11	/* is freezing for suspend */
 
 /* as above, but as bit values */
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index 4827a3aeac7f..fb2ea7705a46 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -223,7 +223,7 @@ register struct thread_info *current_thread_info_reg asm("g6");
  *       an immediate value in instructions such as andcc.
  */
 /* flag bit 12 is available */
-#define TIF_MEMDIE		13
+#define TIF_MEMDIE		13	/* is terminating due to OOM killer */
 #define TIF_POLLING_NRFLAG	14
 #define TIF_FREEZE		15	/* is freezing for suspend */
 
diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h
index fd911f855367..e2cf786bda0a 100644
--- a/arch/um/include/asm/thread_info.h
+++ b/arch/um/include/asm/thread_info.h
@@ -63,10 +63,9 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_SIGPENDING		1	/* signal pending */
 #define TIF_NEED_RESCHED	2	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG      3       /* true if poll_idle() is polling
-					 * TIF_NEED_RESCHED
-					 */
-#define TIF_RESTART_BLOCK 	4
-#define TIF_MEMDIE	 	5
+					 * TIF_NEED_RESCHED */
+#define TIF_RESTART_BLOCK	4
+#define TIF_MEMDIE		5	/* is terminating due to OOM killer */
 #define TIF_SYSCALL_AUDIT	6
 #define TIF_RESTORE_SIGMASK	7
 #define TIF_FREEZE		16	/* is freezing for suspend */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e0d28901e969..4cc1be269757 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -87,7 +87,7 @@ struct thread_info {
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
-#define TIF_MEMDIE		20
+#define TIF_MEMDIE		20	/* is terminating due to OOM killer */
 #define TIF_DEBUG		21	/* uses debug registers */
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
 #define TIF_FREEZE		23	/* is freezing for suspend */
diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h
index 13165641cc51..7be8accb0b0c 100644
--- a/arch/xtensa/include/asm/thread_info.h
+++ b/arch/xtensa/include/asm/thread_info.h
@@ -129,7 +129,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_NEED_RESCHED	2	/* rescheduling necessary */
 #define TIF_SINGLESTEP		3	/* restore singlestep on return to user mode */
 #define TIF_IRET		4	/* return with iret */
-#define TIF_MEMDIE		5
+#define TIF_MEMDIE		5	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	6	/* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_FREEZE		17	/* is freezing for suspend */
-- 
cgit v1.2.3-55-g7522


From ade029e2aaacc8965a548b0b0f80c5bee97ffc68 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sat, 24 Apr 2010 09:56:53 +0200
Subject: x86, k8: Fix build error when K8_NB is disabled

K8_NB depends on PCI and when the last is disabled (allnoconfig) we fail
at the final linking stage due to missing exported num_k8_northbridges.
Add a header stub for that.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20100503183036.GJ26107@aftab>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@kernel.org>
---
 arch/x86/include/asm/k8.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index f70e60071fe8..af00bd1d2089 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -16,11 +16,16 @@ extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
 extern int k8_scan_nodes(void);
 
 #ifdef CONFIG_K8_NB
+extern int num_k8_northbridges;
+
 static inline struct pci_dev *node_to_k8_nb_misc(int node)
 {
 	return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
 }
+
 #else
+#define num_k8_northbridges 0
+
 static inline struct pci_dev *node_to_k8_nb_misc(int node)
 {
 	return NULL;
-- 
cgit v1.2.3-55-g7522


From 7f284d3cc96e02468a42e045f77af11e5ff8b095 Mon Sep 17 00:00:00 2001
From: Frank Arnold
Date: Thu, 22 Apr 2010 16:06:59 +0200
Subject: x86, cacheinfo: Turn off L3 cache index disable feature in
 virtualized environments

When running a quest kernel on xen we get:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000038
IP: [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
PGD 0
Oops: 0000 [#1] SMP
last sysfs file:
CPU 0
Modules linked in:

Pid: 0, comm: swapper Tainted: G        W  2.6.34-rc3 #1 /HVM domU
RIP: 0010:[<ffffffff8142f2fb>]  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x
2ca/0x3df
RSP: 0018:ffff880002203e08  EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000060
RDX: 0000000000000000 RSI: 0000000000000040 RDI: 0000000000000000
RBP: ffff880002203ed8 R08: 00000000000017c0 R09: ffff880002203e38
R10: ffff8800023d5d40 R11: ffffffff81a01e28 R12: ffff880187e6f5c0
R13: ffff880002203e34 R14: ffff880002203e58 R15: ffff880002203e68
FS:  0000000000000000(0000) GS:ffff880002200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000038 CR3: 0000000001a3c000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffffffff81a00000, task ffffffff81a44020)
Stack:
 ffffffff810d7ecb ffff880002203e20 ffffffff81059140 ffff880002203e30
<0> ffffffff810d7ec9 0000000002203e40 000000000050d140 ffff880002203e70
<0> 0000000002008140 0000000000000086 ffff880040020140 ffffffff81068b8b
Call Trace:
 <IRQ>
 [<ffffffff810d7ecb>] ? sync_supers_timer_fn+0x0/0x1c
 [<ffffffff81059140>] ? mod_timer+0x23/0x25
 [<ffffffff810d7ec9>] ? arm_supers_timer+0x34/0x36
 [<ffffffff81068b8b>] ? hrtimer_get_next_event+0xa7/0xc3
 [<ffffffff81058e85>] ? get_next_timer_interrupt+0x19a/0x20d
 [<ffffffff8142fa23>] get_cpu_leaves+0x5c/0x232
 [<ffffffff8106a7b1>] ? sched_clock_local+0x1c/0x82
 [<ffffffff8106a9a0>] ? sched_clock_tick+0x75/0x7a
 [<ffffffff8107748c>] generic_smp_call_function_single_interrupt+0xae/0xd0
 [<ffffffff8101f6ef>] smp_call_function_single_interrupt+0x18/0x27
 [<ffffffff8100a773>] call_function_single_interrupt+0x13/0x20
 <EOI>
 [<ffffffff8143c468>] ? notifier_call_chain+0x14/0x63
 [<ffffffff810295c6>] ? native_safe_halt+0xc/0xd
 [<ffffffff810114eb>] ? default_idle+0x36/0x53
 [<ffffffff81008c22>] cpu_idle+0xaa/0xe4
 [<ffffffff81423a9a>] rest_init+0x7e/0x80
 [<ffffffff81b10dd2>] start_kernel+0x40e/0x419
 [<ffffffff81b102c8>] x86_64_start_reservations+0xb3/0xb7
 [<ffffffff81b103c4>] x86_64_start_kernel+0xf8/0x107
Code: 14 d5 40 ff ae 81 8b 14 02 31 c0 3b 15 47 1c 8b 00 7d 0e 48 8b 05 36 1c 8b
 00 48 63 d2 48 8b 04 d0 c7 85 5c ff ff ff 00 00 00 00 <8b> 70 38 48 8d 8d 5c ff
 ff ff 48 8b 78 10 ba c4 01 00 00 e8 eb
RIP  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
 RSP <ffff880002203e08>
CR2: 0000000000000038
---[ end trace a7919e7f17c0a726 ]---

The L3 cache index disable feature of AMD CPUs has to be disabled if the
kernel is running as guest on top of a hypervisor because northbridge
devices are not available to the guest. Currently, this fixes a boot
crash on top of Xen. In the future this will become an issue on KVM as
well.

Check if northbridge devices are present and do not enable the feature
if there are none.

[ hpa: backported to 2.6.34 ]

Signed-off-by: Frank Arnold <frank.arnold@amd.com>
LKML-Reference: <1271945222-5283-3-git-send-email-bp@amd64.org>
Acked-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@kernel.org>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b3eeb66c0a51..95962a93f99a 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -340,6 +340,10 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	     (boot_cpu_data.x86_mask  < 0x1)))
 		return;
 
+	/* not in virtualized environments */
+	if (num_k8_northbridges == 0)
+		return;
+
 	this_leaf->can_disable = true;
 	this_leaf->l3_indices  = amd_calc_l3_indices();
 }
-- 
cgit v1.2.3-55-g7522


From e9b1d5d0ff4d3ae86050dc4c91b3147361c7af9e Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Fri, 14 May 2010 13:55:57 -0700
Subject: x86, mrst: Don't blindly access extended config space

Do not blindly access extended configuration space unless we actively
know we're on a Moorestown platform.  The fixed-size BAR capability
lives in the extended configuration space, and thus is not applicable
if the configuration space isn't appropriately sized.

This fixes booting certain VMware configurations with CONFIG_MRST=y.

Moorestown will add a fake PCI-X 266 capability to advertise the
presence of extended configuration space.

Reported-and-tested-by: Petr Vandrovec <petr@vandrovec.name>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Jacob Pan <jacob.jun.pan@intel.com>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
LKML-Reference: <AANLkTiltKUa3TrKR1M51eGw8FLNoQJSLT0k0_K5X3-OJ@mail.gmail.com>
---
 arch/x86/pci/mrst.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 8bf2fcb88d04..1cdc02cf8fa4 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -247,6 +247,10 @@ static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev)
 	u32 size;
 	int i;
 
+	/* Must have extended configuration space */
+	if (dev->cfg_size < PCIE_CAP_OFFSET + 4)
+		return;
+
 	/* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */
 	offset = fixed_bar_cap(dev->bus, dev->devfn);
 	if (!offset || PCI_DEVFN(2, 0) == dev->devfn ||
-- 
cgit v1.2.3-55-g7522


From 1ff3d7d79204612ebe2e611d2592f8898908ca00 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Fri, 14 May 2010 23:08:15 +0400
Subject: x86, perf: P4 PMU - fix counters management logic

Jaswinder reported this #GP:

 |
 | Message from syslogd@ht at May 14 09:39:32 ...
 | kernel:[  314.908612] EIP: [<c100ccca>]
 | x86_perf_event_set_period+0x19d/0x1b2 SS:ESP 0068:edac3d70
 |

Ming has narrowed it down to a comparision issue
between arguments with different sizes and
signs. As result event index reached a wrong
value which in turn led to a GP fault.

At the same time it was found that p4_next_cntr
has broken logic and should return the counter
index only if it was not yet borrowed for
another event.

Reported-by: Jaswinder Singh Rajput <jaswinderlinux@gmail.com>
Reported-by: Lin Ming <ming.m.lin@intel.com>
Bisected-by: Lin Ming <ming.m.lin@intel.com>
Tested-by: Jaswinder Singh Rajput <jaswinderlinux@gmail.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100514190815.GG13509@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index cb875b1e2e87..424fc8de68e4 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -18,7 +18,7 @@
 struct p4_event_bind {
 	unsigned int opcode;			/* Event code and ESCR selector */
 	unsigned int escr_msr[2];		/* ESCR MSR for this event */
-	unsigned char cntr[2][P4_CNTR_LIMIT];	/* counter index (offset), -1 on abscence */
+	char cntr[2][P4_CNTR_LIMIT];		/* counter index (offset), -1 on abscence */
 };
 
 struct p4_cache_event_bind {
@@ -747,11 +747,11 @@ static int p4_get_escr_idx(unsigned int addr)
 static int p4_next_cntr(int thread, unsigned long *used_mask,
 			struct p4_event_bind *bind)
 {
-	int i = 0, j;
+	int i, j;
 
 	for (i = 0; i < P4_CNTR_LIMIT; i++) {
-		j = bind->cntr[thread][i++];
-		if (j == -1 || !test_bit(j, used_mask))
+		j = bind->cntr[thread][i];
+		if (j != -1 && !test_bit(j, used_mask))
 			return j;
 	}
 
-- 
cgit v1.2.3-55-g7522


From e4af4268a34d8cd28c46a03161fc017cbd2db887 Mon Sep 17 00:00:00 2001
From: Jacob Pan
Date: Fri, 14 May 2010 14:41:14 -0700
Subject: x86, mrst, pci: return 0 for non-present pci bars

Moorestown PCI code has special handling of devices with fixed BARs. In
case of BAR sizing writes, we need to update the fake PCI MMCFG space with real
size decode value.

When a BAR is not present, we need to return 0 instead of ~0. ~0 will be
treated as device error per bugzilla 12006.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1273873281-17489-2-git-send-email-jacob.jun.pan@linux.intel.com>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/pci/mrst.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 8bf2fcb88d04..d5c7aefe56ff 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -109,7 +109,7 @@ static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
 			decode++;
 			decode = ~(decode - 1);
 		} else {
-			decode = ~0;
+			decode = 0;
 		}
 
 		/*
-- 
cgit v1.2.3-55-g7522


From fea24e28c663e62663097f0ed3b8ff1f9a87f15e Mon Sep 17 00:00:00 2001
From: Jacob Pan
Date: Fri, 14 May 2010 14:41:20 -0700
Subject: x86, mrst: add nop functions to x86_init mpparse functions

Moorestown does not have BIOS provided MP tables, we can save some time
by avoiding scaning of these tables. e.g.
[    0.000000] Scan SMP from c0000000 for 1024 bytes.
[    0.000000] Scan SMP from c009fc00 for 1024 bytes.
[    0.000000] Scan SMP from c00f0000 for 65536 bytes.
[    0.000000] Scan SMP from c00bfff0 for 1024 bytes.

Searching EBDA with the base at 0x40E will also result in random pointer
deferencing within 1MB. This can be a problem in Lincroft if the pointer
hits VGA area and VGA mode is not enabled.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1273873281-17489-8-git-send-email-jacob.jun.pan@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/mrst.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 0aad8670858e..e796448f0eb5 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -237,4 +237,9 @@ void __init x86_mrst_early_setup(void)
 	x86_init.pci.fixup_irqs = x86_init_noop;
 
 	legacy_pic = &null_legacy_pic;
+
+	/* Avoid searching for BIOS MP tables */
+	x86_init.mpparse.find_smp_config = x86_init_noop;
+	x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+
 }
-- 
cgit v1.2.3-55-g7522


From 0fc5c3a54d68d0e6c2f3b346dcc924ba928c4d0e Mon Sep 17 00:00:00 2001
From: Andrea Gelmini
Date: Sat, 27 Feb 2010 17:51:43 +0100
Subject: KVM: arch/x86/kvm/kvm_timer.h checkpatch cleanup

arch/x86/kvm/kvm_timer.h:13: ERROR: code indent should use tabs where possible

Signed-off-by: Andrea Gelmini <andrea.gelmini@gelma.net>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/kvm_timer.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 55c7524dda54..64bc6ea78d90 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -10,9 +10,7 @@ struct kvm_timer {
 };
 
 struct kvm_timer_ops {
-        bool (*is_periodic)(struct kvm_timer *);
+	bool (*is_periodic)(struct kvm_timer *);
 };
 
-
 enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
-
-- 
cgit v1.2.3-55-g7522


From d24778265ac9b2602889a5e99c6e7ba777a236df Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Mon, 1 Mar 2010 15:34:34 +0100
Subject: KVM: SVM: Return correct values in nested_svm_exit_handled_msr

The nested_svm_exit_handled_msr() returned an bool which is
a bug. I worked by accident because the exected integer
return values match with the true and false values. This
patch changes the return value to int and let the function
return the correct values.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6882be5b9267..07437ca12787 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1552,16 +1552,16 @@ static void nested_svm_unmap(struct page *page)
 	kvm_release_page_dirty(page);
 }
 
-static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
 	u32 param = svm->vmcb->control.exit_info_1 & 1;
 	u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-	bool ret = false;
 	u32 t0, t1;
+	int ret;
 	u8 val;
 
 	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
-		return false;
+		return NESTED_EXIT_HOST;
 
 	switch (msr) {
 	case 0 ... 0x1fff:
@@ -1579,12 +1579,12 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 		t0 %= 8;
 		break;
 	default:
-		ret = true;
+		ret = NESTED_EXIT_DONE;
 		goto out;
 	}
 
 	if (!kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + t1, &val, 1))
-		ret = val & ((1 << param) << t0);
+		ret = val & ((1 << param) << t0) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
 
 out:
 	return ret;
-- 
cgit v1.2.3-55-g7522


From 455716fa941ec7a03c04bd54e1b906698171b15c Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Mon, 1 Mar 2010 15:34:35 +0100
Subject: KVM: SVM: Move msrpm offset calculation to seperate function

The algorithm to find the offset in the msrpm for a given
msr is needed at other places too. Move that logic to its
own function.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 53 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 37 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 07437ca12787..429a24435c6d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -117,6 +117,8 @@ struct vcpu_svm {
 	unsigned long int3_rip;
 };
 
+#define MSR_INVALID			0xffffffffU
+
 /* enable NPT for AMD64 and X86 with PAE */
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 static bool npt_enabled = true;
@@ -200,6 +202,27 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 #define MSRS_RANGE_SIZE 2048
 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 
+static u32 svm_msrpm_offset(u32 msr)
+{
+	u32 offset;
+	int i;
+
+	for (i = 0; i < NUM_MSR_MAPS; i++) {
+		if (msr < msrpm_ranges[i] ||
+		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
+			continue;
+
+		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
+		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
+
+		/* Now we have the u8 offset - but need the u32 offset */
+		return offset / 4;
+	}
+
+	/* MSR not in any range */
+	return MSR_INVALID;
+}
+
 #define MAX_INST_SIZE 15
 
 static inline u32 svm_has(u32 feat)
@@ -418,23 +441,21 @@ err_1:
 static void set_msr_interception(u32 *msrpm, unsigned msr,
 				 int read, int write)
 {
-	int i;
+	u8 bit_read, bit_write;
+	unsigned long tmp;
+	u32 offset;
 
-	for (i = 0; i < NUM_MSR_MAPS; i++) {
-		if (msr >= msrpm_ranges[i] &&
-		    msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
-			u32 msr_offset = (i * MSRS_IN_RANGE + msr -
-					  msrpm_ranges[i]) * 2;
-
-			u32 *base = msrpm + (msr_offset / 32);
-			u32 msr_shift = msr_offset % 32;
-			u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
-			*base = (*base & ~(0x3 << msr_shift)) |
-				(mask << msr_shift);
-			return;
-		}
-	}
-	BUG();
+	offset    = svm_msrpm_offset(msr);
+	bit_read  = 2 * (msr & 0x0f);
+	bit_write = 2 * (msr & 0x0f) + 1;
+	tmp       = msrpm[offset];
+
+	BUG_ON(offset == MSR_INVALID);
+
+	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
+	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+
+	msrpm[offset] = tmp;
 }
 
 static void svm_vcpu_init_msrpm(u32 *msrpm)
-- 
cgit v1.2.3-55-g7522


From ac72a9b733995cb3ef538000f6309b5e724aa469 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Mon, 1 Mar 2010 15:34:36 +0100
Subject: KVM: SVM: Introduce direct access msr list

This patch introduces a list with all msrs a guest might
have direct access to and changes the svm_vcpu_init_msrpm
function to use this list.
It also adds a check to set_msr_interception which triggers
a warning if a developer changes a msr intercept that is not
in the list.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 56 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 46 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 429a24435c6d..a079550d3886 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -119,6 +119,27 @@ struct vcpu_svm {
 
 #define MSR_INVALID			0xffffffffU
 
+static struct svm_direct_access_msrs {
+	u32 index;   /* Index of the MSR */
+	bool always; /* True if intercept is always on */
+} direct_access_msrs[] = {
+	{ .index = MSR_K6_STAR,				.always = true  },
+	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
+#ifdef CONFIG_X86_64
+	{ .index = MSR_GS_BASE,				.always = true  },
+	{ .index = MSR_FS_BASE,				.always = true  },
+	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
+	{ .index = MSR_LSTAR,				.always = true  },
+	{ .index = MSR_CSTAR,				.always = true  },
+	{ .index = MSR_SYSCALL_MASK,			.always = true  },
+#endif
+	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
+	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
+	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
+	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
+	{ .index = MSR_INVALID,				.always = false },
+};
+
 /* enable NPT for AMD64 and X86 with PAE */
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 static bool npt_enabled = true;
@@ -438,6 +459,17 @@ err_1:
 
 }
 
+static bool valid_msr_intercept(u32 index)
+{
+	int i;
+
+	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
+		if (direct_access_msrs[i].index == index)
+			return true;
+
+	return false;
+}
+
 static void set_msr_interception(u32 *msrpm, unsigned msr,
 				 int read, int write)
 {
@@ -445,6 +477,12 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
 	unsigned long tmp;
 	u32 offset;
 
+	/*
+	 * If this warning triggers extend the direct_access_msrs list at the
+	 * beginning of the file
+	 */
+	WARN_ON(!valid_msr_intercept(msr));
+
 	offset    = svm_msrpm_offset(msr);
 	bit_read  = 2 * (msr & 0x0f);
 	bit_write = 2 * (msr & 0x0f) + 1;
@@ -460,18 +498,16 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
 
 static void svm_vcpu_init_msrpm(u32 *msrpm)
 {
+	int i;
+
 	memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
 
-#ifdef CONFIG_X86_64
-	set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
-	set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
-	set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
-	set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
-	set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
-	set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
-#endif
-	set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
-	set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
+	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+		if (!direct_access_msrs[i].always)
+			continue;
+
+		set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
+	}
 }
 
 static void svm_enable_lbrv(struct vcpu_svm *svm)
-- 
cgit v1.2.3-55-g7522


From 323c3d809b8bd42d6d557c734d4bdfdefa110445 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Mon, 1 Mar 2010 15:34:37 +0100
Subject: KVM: SVM: Optimize nested svm msrpm merging

This patch optimizes the way the msrpm of the host and the
guest are merged. The old code merged the 2 msrpm pages
completly. This code needed to touch 24kb of memory for that
operation. The optimized variant this patch introduces
merges only the parts where the host msrpm may contain zero
bits. This reduces the amount of memory which is touched to
48 bytes.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 71 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a079550d3886..45a287e51e18 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -93,6 +93,9 @@ struct nested_state {
 
 };
 
+#define MSRPM_OFFSETS	16
+static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+
 struct vcpu_svm {
 	struct kvm_vcpu vcpu;
 	struct vmcb *vmcb;
@@ -510,6 +513,49 @@ static void svm_vcpu_init_msrpm(u32 *msrpm)
 	}
 }
 
+static void add_msr_offset(u32 offset)
+{
+	int i;
+
+	for (i = 0; i < MSRPM_OFFSETS; ++i) {
+
+		/* Offset already in list? */
+		if (msrpm_offsets[i] == offset)
+			return;
+
+		/* Slot used by another offset? */
+		if (msrpm_offsets[i] != MSR_INVALID)
+			continue;
+
+		/* Add offset to list */
+		msrpm_offsets[i] = offset;
+
+		return;
+	}
+
+	/*
+	 * If this BUG triggers the msrpm_offsets table has an overflow. Just
+	 * increase MSRPM_OFFSETS in this case.
+	 */
+	BUG();
+}
+
+static void init_msrpm_offsets(void)
+{
+	int i;
+
+	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+
+	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+		u32 offset;
+
+		offset = svm_msrpm_offset(direct_access_msrs[i].index);
+		BUG_ON(offset == MSR_INVALID);
+
+		add_msr_offset(offset);
+	}
+}
+
 static void svm_enable_lbrv(struct vcpu_svm *svm)
 {
 	u32 *msrpm = svm->msrpm;
@@ -548,6 +594,8 @@ static __init int svm_hardware_setup(void)
 	memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
 	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
 
+	init_msrpm_offsets();
+
 	if (boot_cpu_has(X86_FEATURE_NX))
 		kvm_enable_efer_bits(EFER_NX);
 
@@ -811,6 +859,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	svm_vcpu_init_msrpm(svm->msrpm);
 
 	svm->nested.msrpm = page_address(nested_msrpm_pages);
+	svm_vcpu_init_msrpm(svm->nested.msrpm);
 
 	svm->vmcb = page_address(page);
 	clear_page(svm->vmcb);
@@ -1888,20 +1937,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 
 static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 {
-	u32 *nested_msrpm;
-	struct page *page;
+	/*
+	 * This function merges the msr permission bitmaps of kvm and the
+	 * nested vmcb. It is omptimized in that it only merges the parts where
+	 * the kvm msr permission bitmap may contain zero bits
+	 */
 	int i;
 
-	nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, &page);
-	if (!nested_msrpm)
-		return false;
+	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+		return true;
 
-	for (i = 0; i < PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
-		svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
+	for (i = 0; i < MSRPM_OFFSETS; i++) {
+		u32 value, p;
+		u64 offset;
 
-	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+		if (msrpm_offsets[i] == 0xffffffff)
+			break;
 
-	nested_svm_unmap(page);
+		offset = svm->nested.vmcb_msrpm + msrpm_offsets[i];
+		p      = msrpm_offsets[i] / 4;
+
+		if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
+			return false;
+
+		svm->nested.msrpm[p] = svm->msrpm[p] | value;
+	}
+
+	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
 
 	return true;
 }
-- 
cgit v1.2.3-55-g7522


From 0d6b35378e80c555e20ca3aa3d3cc609b403cbb6 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Mon, 1 Mar 2010 15:34:38 +0100
Subject: KVM: SVM: Use svm_msrpm_offset in nested_svm_exit_handled_msr

There is a generic function now to calculate msrpm offsets.
Use that function in nested_svm_exit_handled_msr() remove
the duplicate logic (which had a bug anyway).

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 47 +++++++++++++++++------------------------------
 1 file changed, 17 insertions(+), 30 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 45a287e51e18..7cb2eb906eca 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1660,40 +1660,27 @@ static void nested_svm_unmap(struct page *page)
 
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
-	u32 param = svm->vmcb->control.exit_info_1 & 1;
-	u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-	u32 t0, t1;
-	int ret;
-	u8 val;
+	u32 offset, msr, value;
+	int write, mask;
 
 	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
 		return NESTED_EXIT_HOST;
 
-	switch (msr) {
-	case 0 ... 0x1fff:
-		t0 = (msr * 2) % 8;
-		t1 = msr / 8;
-		break;
-	case 0xc0000000 ... 0xc0001fff:
-		t0 = (8192 + msr - 0xc0000000) * 2;
-		t1 = (t0 / 8);
-		t0 %= 8;
-		break;
-	case 0xc0010000 ... 0xc0011fff:
-		t0 = (16384 + msr - 0xc0010000) * 2;
-		t1 = (t0 / 8);
-		t0 %= 8;
-		break;
-	default:
-		ret = NESTED_EXIT_DONE;
-		goto out;
-	}
+	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+	offset = svm_msrpm_offset(msr);
+	write  = svm->vmcb->control.exit_info_1 & 1;
+	mask   = 1 << ((2 * (msr & 0xf)) + write);
 
-	if (!kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + t1, &val, 1))
-		ret = val & ((1 << param) << t0) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+	if (offset == MSR_INVALID)
+		return NESTED_EXIT_DONE;
 
-out:
-	return ret;
+	/* Offset is in 32 bit units but need in 8 bit units */
+	offset *= 4;
+
+	if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
+		return NESTED_EXIT_DONE;
+
+	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
 }
 
 static int nested_svm_exit_special(struct vcpu_svm *svm)
@@ -1954,8 +1941,8 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 		if (msrpm_offsets[i] == 0xffffffff)
 			break;
 
-		offset = svm->nested.vmcb_msrpm + msrpm_offsets[i];
-		p      = msrpm_offsets[i] / 4;
+		p      = msrpm_offsets[i];
+		offset = svm->nested.vmcb_msrpm + (p * 4);
 
 		if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
 			return false;
-- 
cgit v1.2.3-55-g7522


From ce2ac085ff1500aad157cabd8221b7c38eb751bd Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Mon, 1 Mar 2010 15:34:39 +0100
Subject: KVM; SVM: Add correct handling of nested iopm

This patch adds the correct handling of the nested io
permission bitmap. Old behavior was to not lookup the port
in the iopm but only reinject an io intercept to the guest.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7cb2eb906eca..ddea391e78cd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -79,6 +79,7 @@ struct nested_state {
 
 	/* gpa pointers to the real vectors */
 	u64 vmcb_msrpm;
+	u64 vmcb_iopm;
 
 	/* A VMEXIT is required but not yet emulated */
 	bool exit_required;
@@ -1658,6 +1659,26 @@ static void nested_svm_unmap(struct page *page)
 	kvm_release_page_dirty(page);
 }
 
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+	unsigned port;
+	u8 val, bit;
+	u64 gpa;
+
+	if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+		return NESTED_EXIT_HOST;
+
+	port = svm->vmcb->control.exit_info_1 >> 16;
+	gpa  = svm->nested.vmcb_iopm + (port / 8);
+	bit  = port % 8;
+	val  = 0;
+
+	if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
+		val &= (1 << bit);
+
+	return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
 	u32 offset, msr, value;
@@ -1723,6 +1744,9 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 	case SVM_EXIT_MSR:
 		vmexit = nested_svm_exit_handled_msr(svm);
 		break;
+	case SVM_EXIT_IOIO:
+		vmexit = nested_svm_intercept_ioio(svm);
+		break;
 	case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
 		u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
 		if (svm->nested.intercept_cr_read & cr_bits)
@@ -2047,6 +2071,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
 
 	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+	svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
 
 	/* cache intercepts */
 	svm->nested.intercept_cr_read    = nested_vmcb->control.intercept_cr_read;
-- 
cgit v1.2.3-55-g7522


From f71385383fb86246ab3c69cc17a09ef9883590d8 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Mon, 1 Mar 2010 15:34:40 +0100
Subject: KVM: SVM: Ignore lower 12 bit of nested msrpm_pa

These bits are ignored by the hardware too. Implement this
for nested svm too.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ddea391e78cd..490bec199887 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2070,7 +2070,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
 	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
 
-	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
 	svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
 
 	/* cache intercepts */
-- 
cgit v1.2.3-55-g7522


From 835e6b80478e59820cff127adba3e518ae5a43f5 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Wed, 3 Mar 2010 17:53:05 +0200
Subject: KVM: x86 emulator mark VMMCALL and LMSW as privileged

LMSW is present in both group tables. It was marked privileged only in
one of them. Intel analog of VMMCALL is already marked privileged.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5b6794adaa2e..2832a8c07c6a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -362,9 +362,9 @@ static u32 group_table[] = {
 
 static u32 group2_table[] = {
 	[Group7*8] =
-	SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
+	SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
 	SrcNone | ModRM | DstMem | Mov, 0,
-	SrcMem16 | ModRM | Mov, 0,
+	SrcMem16 | ModRM | Mov | Priv, 0,
 	[Group9*8] =
 	0, 0, 0, 0, 0, 0, 0, 0,
 };
-- 
cgit v1.2.3-55-g7522


From 2ed152afc7ed61830b848b32936e1541a1a57799 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Wed, 10 Mar 2010 19:00:43 +0800
Subject: KVM: cleanup kvm trace

This patch does:

 - no need call tracepoint_synchronize_unregister() when kvm module
   is unloaded since ftrace can handle it

 - cleanup ftrace's macro

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 1 -
 arch/x86/kvm/mmutrace.h    | 7 +++++--
 arch/x86/kvm/trace.h       | 7 +++++--
 arch/x86/kvm/x86.c         | 2 +-
 include/trace/events/kvm.h | 1 -
 virt/kvm/kvm_main.c        | 1 -
 6 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 19a8906bcaa2..3af2dfd8778e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,6 @@ module_param(oos_shadow, bool, 0644);
 
 #include <trace/events/kvm.h>
 
-#undef TRACE_INCLUDE_FILE
 #define CREATE_TRACE_POINTS
 #include "mmutrace.h"
 
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3e4a5c6ca2a9..1fe956ab7617 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -6,8 +6,6 @@
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
-#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE mmutrace
 
 #define KVM_MMU_PAGE_FIELDS \
 	__field(__u64, gfn) \
@@ -216,5 +214,10 @@ TRACE_EVENT(
 
 #endif /* _TRACE_KVMMMU_H */
 
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mmutrace
+
 /* This part must be outside protection */
 #include <trace/define_trace.h>
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 17b52ccd9774..b75efef79e56 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -5,8 +5,6 @@
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
-#define TRACE_INCLUDE_PATH arch/x86/kvm
-#define TRACE_INCLUDE_FILE trace
 
 /*
  * Tracepoint for guest mode entry.
@@ -575,5 +573,10 @@ TRACE_EVENT(kvm_skinit,
 
 #endif /* _TRACE_KVM_H */
 
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH arch/x86/kvm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
 /* This part must be outside protection */
 #include <trace/define_trace.h>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b34dc705cfb..74e70d975ffa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -41,7 +41,7 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <trace/events/kvm.h>
-#undef TRACE_INCLUDE_FILE
+
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index b17d49dfc3ef..6dd3a51ab1cb 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -5,7 +5,6 @@
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
-#define TRACE_INCLUDE_FILE kvm
 
 #if defined(__KVM_HAVE_IOAPIC)
 TRACE_EVENT(kvm_set_irq,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5bac6eb0f0a9..b152b23cd095 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2279,7 +2279,6 @@ EXPORT_SYMBOL_GPL(kvm_init);
 
 void kvm_exit(void)
 {
-	tracepoint_synchronize_unregister();
 	kvm_exit_debug();
 	misc_deregister(&kvm_dev);
 	kmem_cache_destroy(kvm_vcpu_cache);
-- 
cgit v1.2.3-55-g7522


From d4f64b6cad0fc0fb4cec868c6ca6b1325949d08b Mon Sep 17 00:00:00 2001
From: Minchan Kim
Date: Wed, 10 Mar 2010 23:31:22 +0900
Subject: KVM: remove redundant initialization of page->private

The prep_new_page() in page allocator calls set_page_private(page, 0).
So we don't need to reinitialize private of page.

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Avi Kivity<avi@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3af2dfd8778e..4455ddbe36f4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -326,7 +326,6 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
 		page = alloc_page(GFP_KERNEL);
 		if (!page)
 			return -ENOMEM;
-		set_page_private(page, 0);
 		cache->objects[cache->nobjs++] = page_address(page);
 	}
 	return 0;
-- 
cgit v1.2.3-55-g7522


From 5bfd8b5455e69b37af16a2df1edae2c3b567648c Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 11 Mar 2010 10:50:44 +0200
Subject: KVM: Move kvm_exit tracepoint rip reading inside tracepoint

Reading rip is expensive on vmx, so move it inside the tracepoint so we only
incur the cost if tracing is enabled.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c   | 2 +-
 arch/x86/kvm/trace.h | 6 +++---
 arch/x86/kvm/vmx.c   | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 490bec199887..abbc3f9d03b2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2686,7 +2686,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 	struct kvm_run *kvm_run = vcpu->run;
 	u32 exit_code = svm->vmcb->control.exit_code;
 
-	trace_kvm_exit(exit_code, svm->vmcb->save.rip);
+	trace_kvm_exit(exit_code, vcpu);
 
 	if (unlikely(svm->nested.exit_required)) {
 		nested_svm_vmexit(svm);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index b75efef79e56..d10b359a21f3 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -182,8 +182,8 @@ TRACE_EVENT(kvm_apic,
  * Tracepoint for kvm guest exit:
  */
 TRACE_EVENT(kvm_exit,
-	TP_PROTO(unsigned int exit_reason, unsigned long guest_rip),
-	TP_ARGS(exit_reason, guest_rip),
+	TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
+	TP_ARGS(exit_reason, vcpu),
 
 	TP_STRUCT__entry(
 		__field(	unsigned int,	exit_reason	)
@@ -192,7 +192,7 @@ TRACE_EVENT(kvm_exit,
 
 	TP_fast_assign(
 		__entry->exit_reason	= exit_reason;
-		__entry->guest_rip	= guest_rip;
+		__entry->guest_rip	= kvm_rip_read(vcpu);
 	),
 
 	TP_printk("reason %s rip 0x%lx",
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8e2a24693be9..3dbfc20824b7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3612,7 +3612,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	u32 exit_reason = vmx->exit_reason;
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
-	trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
+	trace_kvm_exit(exit_reason, vcpu);
 
 	/* If guest state is invalid, start emulating */
 	if (vmx->emulation_required && emulate_invalid_guest_state)
-- 
cgit v1.2.3-55-g7522


From 5c1c85d08da5c257b21b0423b96fa6554aa4cb6f Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 11 Mar 2010 13:01:59 +0200
Subject: KVM: Trace exception injection

Often an exception can help point out where things start to go wrong.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/trace.h | 32 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c   |  3 +++
 2 files changed, 35 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index d10b359a21f3..32c912c40bf8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -219,6 +219,38 @@ TRACE_EVENT(kvm_inj_virq,
 	TP_printk("irq %u", __entry->irq)
 );
 
+#define EXS(x) { x##_VECTOR, "#" #x }
+
+#define kvm_trace_sym_exc						\
+	EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM),	\
+	EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF),		\
+	EXS(MF), EXS(MC)
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_exception,
+	TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
+	TP_ARGS(exception, has_error, error_code),
+
+	TP_STRUCT__entry(
+		__field(	u8,	exception	)
+		__field(	u8,	has_error	)
+		__field(	u32,	error_code	)
+	),
+
+	TP_fast_assign(
+		__entry->exception	= exception;
+		__entry->has_error	= has_error;
+		__entry->error_code	= error_code;
+	),
+
+	TP_printk("%s (0x%x)",
+		  __print_symbolic(__entry->exception, kvm_trace_sym_exc),
+		  /* FIXME: don't print error_code if not present */
+		  __entry->has_error ? __entry->error_code : 0)
+);
+
 /*
  * Tracepoint for page fault.
  */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 74e70d975ffa..a1cf87fe9f3a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4237,6 +4237,9 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 {
 	/* try to reinject previous events if any */
 	if (vcpu->arch.exception.pending) {
+		trace_kvm_inj_exception(vcpu->arch.exception.nr,
+					vcpu->arch.exception.has_error_code,
+					vcpu->arch.exception.error_code);
 		kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
 					  vcpu->arch.exception.has_error_code,
 					  vcpu->arch.exception.error_code);
-- 
cgit v1.2.3-55-g7522


From ec68798c8fd0f01cdbd3f3e1a970e76a644cf08e Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Fri, 5 Mar 2010 12:11:48 +0800
Subject: KVM: x86: Use native_store_idt() instead of kvm_get_idt()

This patch use generic linux function native_store_idt()
instead of kvm_get_idt(), and also removed the useless
function kvm_get_idt().

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 5 -----
 arch/x86/kvm/vmx.c              | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ec891a2ce86e..ea1b6c615f9f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -716,11 +716,6 @@ static inline void kvm_load_ldt(u16 sel)
 	asm("lldt %0" : : "rm"(sel));
 }
 
-static inline void kvm_get_idt(struct desc_ptr *table)
-{
-	asm("sidt %0" : "=m"(*table));
-}
-
 #ifdef CONFIG_X86_64
 static inline unsigned long read_msr(unsigned long msr)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3dbfc20824b7..33d88e0a0601 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2452,7 +2452,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
 	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
 
-	kvm_get_idt(&dt);
+	native_store_idt(&dt);
 	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
 
 	asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
-- 
cgit v1.2.3-55-g7522


From 160d2f6c0c90713aa3bb93dd344fe0d527342e26 Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Fri, 12 Mar 2010 10:09:45 +0800
Subject: KVM: x86: fix the error of ioctl KVM_IRQ_LINE if no irq chip

If no irq chip in kernel, ioctl KVM_IRQ_LINE will return -EFAULT.
But I see in other place such as KVM_[GET|SET]IRQCHIP, -ENXIO is
return. So this patch used -ENXIO instead of -EFAULT.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1cf87fe9f3a..7a8fe9051ecf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2836,11 +2836,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&irq_event, argp, sizeof irq_event))
 			goto out;
+		r = -ENXIO;
 		if (irqchip_in_kernel(kvm)) {
 			__s32 status;
 			status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
 					irq_event.irq, irq_event.level);
 			if (ioctl == KVM_IRQ_LINE_STATUS) {
+				r = -EFAULT;
 				irq_event.status = status;
 				if (copy_to_user(argp, &irq_event,
 							sizeof irq_event))
-- 
cgit v1.2.3-55-g7522


From 72016f3a4221799a0b1fdf443ef6e29db572a9bb Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Mon, 15 Mar 2010 13:59:53 +0200
Subject: KVM: MMU: Consolidate two guest pte reads in kvm_mmu_pte_write()

kvm_mmu_pte_write() reads guest ptes in two different occasions, both to
allow a 32-bit pae guest to update a pte with 4-byte writes.  Consolidate
these into a single read, which also allows us to consolidate another read
from an invlpg speculating a gpte into the shadow page table.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 69 ++++++++++++++++++++++++------------------------------
 1 file changed, 31 insertions(+), 38 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4455ddbe36f4..91f8b171c825 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2560,36 +2560,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
 }
 
 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-					  const u8 *new, int bytes)
+					  u64 gpte)
 {
 	gfn_t gfn;
-	int r;
-	u64 gpte = 0;
 	pfn_t pfn;
 
-	if (bytes != 4 && bytes != 8)
-		return;
-
-	/*
-	 * Assume that the pte write on a page table of the same type
-	 * as the current vcpu paging mode.  This is nearly always true
-	 * (might be false while changing modes).  Note it is verified later
-	 * by update_pte().
-	 */
-	if (is_pae(vcpu)) {
-		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
-		if ((bytes == 4) && (gpa % 4 == 0)) {
-			r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
-			if (r)
-				return;
-			memcpy((void *)&gpte + (gpa % 8), new, 4);
-		} else if ((bytes == 8) && (gpa % 8 == 0)) {
-			memcpy((void *)&gpte, new, 8);
-		}
-	} else {
-		if ((bytes == 4) && (gpa % 4 == 0))
-			memcpy((void *)&gpte, new, 4);
-	}
 	if (!is_present_gpte(gpte))
 		return;
 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2640,7 +2615,34 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int r;
 
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
-	mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
+
+	switch (bytes) {
+	case 4:
+		gentry = *(const u32 *)new;
+		break;
+	case 8:
+		gentry = *(const u64 *)new;
+		break;
+	default:
+		gentry = 0;
+		break;
+	}
+
+	/*
+	 * Assume that the pte write on a page table of the same type
+	 * as the current vcpu paging mode.  This is nearly always true
+	 * (might be false while changing modes).  Note it is verified later
+	 * by update_pte().
+	 */
+	if (is_pae(vcpu) && bytes == 4) {
+		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+		gpa &= ~(gpa_t)7;
+		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, 8);
+		if (r)
+			gentry = 0;
+	}
+
+	mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_access_page(vcpu, gfn);
 	kvm_mmu_free_some_pages(vcpu);
@@ -2705,20 +2707,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 				continue;
 		}
 		spte = &sp->spt[page_offset / sizeof(*spte)];
-		if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
-			gentry = 0;
-			r = kvm_read_guest_atomic(vcpu->kvm,
-						  gpa & ~(u64)(pte_size - 1),
-						  &gentry, pte_size);
-			new = (const void *)&gentry;
-			if (r < 0)
-				new = NULL;
-		}
 		while (npte--) {
 			entry = *spte;
 			mmu_pte_write_zap_pte(vcpu, sp, spte);
-			if (new)
-				mmu_pte_write_new_pte(vcpu, sp, spte, new);
+			if (gentry)
+				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
 			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
 			++spte;
 		}
-- 
cgit v1.2.3-55-g7522


From daea3e73cb4ac971bee97f333ae027861d00fc0b Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Mon, 15 Mar 2010 13:59:54 +0200
Subject: KVM: Make locked operations truly atomic

Once upon a time, locked operations were emulated while holding the mmu mutex.
Since mmu pages were write protected, it was safe to emulate the writes in
a non-atomic manner, since there could be no other writer, either in the
guest or in the kernel.

These days emulation takes place without holding the mmu spinlock, so the
write could be preempted by an unshadowing event, which exposes the page
to writes by the guest.  This may cause corruption of guest page tables.

Fix by using an atomic cmpxchg for these operations.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 69 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7a8fe9051ecf..855f3ea9edd1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3291,41 +3291,68 @@ int emulator_write_emulated(unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(emulator_write_emulated);
 
+#define CMPXCHG_TYPE(t, ptr, old, new) \
+	(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
+
+#ifdef CONFIG_X86_64
+#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
+#else
+#  define CMPXCHG64(ptr, old, new) \
+	(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u *)(new)) == *(u64 *)(old))
+#endif
+
 static int emulator_cmpxchg_emulated(unsigned long addr,
 				     const void *old,
 				     const void *new,
 				     unsigned int bytes,
 				     struct kvm_vcpu *vcpu)
 {
-	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
-#ifndef CONFIG_X86_64
-	/* guests cmpxchg8b have to be emulated atomically */
-	if (bytes == 8) {
-		gpa_t gpa;
-		struct page *page;
-		char *kaddr;
-		u64 val;
+	gpa_t gpa;
+	struct page *page;
+	char *kaddr;
+	bool exchanged;
 
-		gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
+	/* guests cmpxchg8b have to be emulated atomically */
+	if (bytes > 8 || (bytes & (bytes - 1)))
+		goto emul_write;
 
-		if (gpa == UNMAPPED_GVA ||
-		   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
-			goto emul_write;
+	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
 
-		if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
-			goto emul_write;
+	if (gpa == UNMAPPED_GVA ||
+	    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+		goto emul_write;
 
-		val = *(u64 *)new;
+	if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+		goto emul_write;
 
-		page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
 
-		kaddr = kmap_atomic(page, KM_USER0);
-		set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
-		kunmap_atomic(kaddr, KM_USER0);
-		kvm_release_page_dirty(page);
+	kaddr = kmap_atomic(page, KM_USER0);
+	kaddr += offset_in_page(gpa);
+	switch (bytes) {
+	case 1:
+		exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+		break;
+	case 2:
+		exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+		break;
+	case 4:
+		exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+		break;
+	case 8:
+		exchanged = CMPXCHG64(kaddr, old, new);
+		break;
+	default:
+		BUG();
 	}
+	kunmap_atomic(kaddr, KM_USER0);
+	kvm_release_page_dirty(page);
+
+	if (!exchanged)
+		return X86EMUL_CMPXCHG_FAILED;
+
 emul_write:
-#endif
+	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 
 	return emulator_write_emulated(addr, new, bytes, vcpu);
 }
-- 
cgit v1.2.3-55-g7522


From 4a5f48f666ccc4ffdbc54241d9cab06806ed7922 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Mon, 15 Mar 2010 13:59:55 +0200
Subject: KVM: Don't follow an atomic operation by a non-atomic one

Currently emulated atomic operations are immediately followed by a non-atomic
operation, so that kvm_mmu_pte_write() can be invoked.  This updates the mmu
but undoes the whole point of doing things atomically.

Fix by only performing the atomic operation and the mmu update, and avoiding
the non-atomic write.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 855f3ea9edd1..dd4a7ad63aff 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3234,7 +3234,8 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 static int emulator_write_emulated_onepage(unsigned long addr,
 					   const void *val,
 					   unsigned int bytes,
-					   struct kvm_vcpu *vcpu)
+					   struct kvm_vcpu *vcpu,
+					   bool mmu_only)
 {
 	gpa_t                 gpa;
 	u32 error_code;
@@ -3250,6 +3251,10 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 		goto mmio;
 
+	if (mmu_only) {
+		kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+		return X86EMUL_CONTINUE;
+	}
 	if (emulator_write_phys(vcpu, gpa, val, bytes))
 		return X86EMUL_CONTINUE;
 
@@ -3270,24 +3275,35 @@ mmio:
 	return X86EMUL_CONTINUE;
 }
 
-int emulator_write_emulated(unsigned long addr,
+int __emulator_write_emulated(unsigned long addr,
 				   const void *val,
 				   unsigned int bytes,
-				   struct kvm_vcpu *vcpu)
+				   struct kvm_vcpu *vcpu,
+				   bool mmu_only)
 {
 	/* Crossing a page boundary? */
 	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
 		int rc, now;
 
 		now = -addr & ~PAGE_MASK;
-		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+		rc = emulator_write_emulated_onepage(addr, val, now, vcpu,
+						     mmu_only);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		addr += now;
 		val += now;
 		bytes -= now;
 	}
-	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+	return emulator_write_emulated_onepage(addr, val, bytes, vcpu,
+					       mmu_only);
+}
+
+int emulator_write_emulated(unsigned long addr,
+				   const void *val,
+				   unsigned int bytes,
+				   struct kvm_vcpu *vcpu)
+{
+	return __emulator_write_emulated(addr, val, bytes, vcpu, false);
 }
 EXPORT_SYMBOL_GPL(emulator_write_emulated);
 
@@ -3351,6 +3367,8 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 	if (!exchanged)
 		return X86EMUL_CMPXCHG_FAILED;
 
+	return __emulator_write_emulated(addr, new, bytes, vcpu, true);
+
 emul_write:
 	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 
@@ -4005,7 +4023,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
-	return emulator_write_emulated(rip, instruction, 3, vcpu);
+	return __emulator_write_emulated(rip, instruction, 3, vcpu, false);
 }
 
 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-- 
cgit v1.2.3-55-g7522


From fbc5d139bb92e6822e4c000f97631a072d8babf9 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Mon, 15 Mar 2010 13:59:56 +0200
Subject: KVM: MMU: Do not instantiate nontrapping spte on unsync page

The update_pte() path currently uses a nontrapping spte when a nonpresent
(or nonaccessed) gpte is written.  This is fine since at present it is only
used on sync pages.  However, on an unsync page this will cause an endless
fault loop as the guest is under no obligation to invlpg a gpte that
transitions from nonpresent to present.

Needed for the next patch which reinstates update_pte() on invlpg.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 81eab9a50e6a..4b37e1acd375 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -258,11 +258,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	pt_element_t gpte;
 	unsigned pte_access;
 	pfn_t pfn;
+	u64 new_spte;
 
 	gpte = *(const pt_element_t *)pte;
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
-		if (!is_present_gpte(gpte))
-			__set_spte(spte, shadow_notrap_nonpresent_pte);
+		if (!is_present_gpte(gpte)) {
+			if (page->unsync)
+				new_spte = shadow_trap_nonpresent_pte;
+			else
+				new_spte = shadow_notrap_nonpresent_pte;
+			__set_spte(spte, new_spte);
+		}
 		return;
 	}
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-- 
cgit v1.2.3-55-g7522


From 08e850c6536db302050c0287649e68e3bbdfe2c7 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Mon, 15 Mar 2010 13:59:57 +0200
Subject: KVM: MMU: Reinstate pte prefetch on invlpg

Commit fb341f57 removed the pte prefetch on guest invlpg, citing guest races.
However, the SDM is adamant that prefetch is allowed:

  "The processor may create entries in paging-structure caches for
   translations required for prefetches and for accesses that are a
   result of speculative execution that would never actually occur
   in the executed code path."

And, in fact, there was a race in the prefetch code: we picked up the pte
without the mmu lock held, so an older invlpg could install the pte over
a newer invlpg.

Reinstate the prefetch logic, but this time note whether another invlpg has
executed using a counter.  If a race occured, do not install the pte.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c              | 37 +++++++++++++++++++++++--------------
 arch/x86/kvm/paging_tmpl.h      | 15 +++++++++++++++
 3 files changed, 39 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ea1b6c615f9f..28826c82d1e2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -389,6 +389,7 @@ struct kvm_arch {
 	unsigned int n_free_mmu_pages;
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_alloc_mmu_pages;
+	atomic_t invlpg_counter;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 91f8b171c825..064c3efb49dc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2613,20 +2613,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int flooded = 0;
 	int npte;
 	int r;
+	int invlpg_counter;
 
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 
-	switch (bytes) {
-	case 4:
-		gentry = *(const u32 *)new;
-		break;
-	case 8:
-		gentry = *(const u64 *)new;
-		break;
-	default:
-		gentry = 0;
-		break;
-	}
+	invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
 
 	/*
 	 * Assume that the pte write on a page table of the same type
@@ -2634,16 +2625,34 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	 * (might be false while changing modes).  Note it is verified later
 	 * by update_pte().
 	 */
-	if (is_pae(vcpu) && bytes == 4) {
+	if ((is_pae(vcpu) && bytes == 4) || !new) {
 		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
-		gpa &= ~(gpa_t)7;
-		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, 8);
+		if (is_pae(vcpu)) {
+			gpa &= ~(gpa_t)7;
+			bytes = 8;
+		}
+		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
 		if (r)
 			gentry = 0;
+		new = (const u8 *)&gentry;
+	}
+
+	switch (bytes) {
+	case 4:
+		gentry = *(const u32 *)new;
+		break;
+	case 8:
+		gentry = *(const u64 *)new;
+		break;
+	default:
+		gentry = 0;
+		break;
 	}
 
 	mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
 	spin_lock(&vcpu->kvm->mmu_lock);
+	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
+		gentry = 0;
 	kvm_mmu_access_page(vcpu, gfn);
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4b37e1acd375..067797a72768 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -463,6 +463,7 @@ out_unlock:
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	struct kvm_shadow_walk_iterator iterator;
+	gpa_t pte_gpa = -1;
 	int level;
 	u64 *sptep;
 	int need_flush = 0;
@@ -476,6 +477,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 		if (level == PT_PAGE_TABLE_LEVEL  ||
 		    ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
 		    ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
+			struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+			pte_gpa = (sp->gfn << PAGE_SHIFT);
+			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
 			if (is_shadow_present_pte(*sptep)) {
 				rmap_remove(vcpu->kvm, sptep);
@@ -493,7 +498,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 
 	if (need_flush)
 		kvm_flush_remote_tlbs(vcpu->kvm);
+
+	atomic_inc(&vcpu->kvm->arch.invlpg_counter);
+
 	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	if (pte_gpa == -1)
+		return;
+
+	if (mmu_topup_memory_caches(vcpu))
+		return;
+	kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
-- 
cgit v1.2.3-55-g7522


From d6d367d6783e38634377bc66b62bff3ffd717e5f Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Mon, 15 Mar 2010 16:38:28 +0200
Subject: KVM: x86 emulator: Fix DstAcc decoding.

Set correct operation length. Add RAX (64bit) handling.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2832a8c07c6a..0b70a364f0f4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1194,9 +1194,9 @@ done_prefixes:
 		break;
 	case DstAcc:
 		c->dst.type = OP_REG;
-		c->dst.bytes = c->op_bytes;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-		switch (c->op_bytes) {
+		switch (c->dst.bytes) {
 			case 1:
 				c->dst.val = *(u8 *)c->dst.ptr;
 				break;
@@ -1206,6 +1206,9 @@ done_prefixes:
 			case 4:
 				c->dst.val = *(u32 *)c->dst.ptr;
 				break;
+			case 8:
+				c->dst.val = *(u64 *)c->dst.ptr;
+				break;
 		}
 		c->dst.orig_val = c->dst.val;
 		break;
-- 
cgit v1.2.3-55-g7522


From c73e197bc525e67b71578126b679446f5b88b508 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Mon, 15 Mar 2010 16:38:29 +0200
Subject: KVM: x86 emulator: fix RCX access during rep emulation

During rep emulation access length to RCX depends on current address
mode.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0b70a364f0f4..4dce80560d26 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1852,7 +1852,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
 	if (c->rep_prefix && (c->d & String)) {
 		/* All REP prefixes have the same first termination condition */
-		if (c->regs[VCPU_REGS_RCX] == 0) {
+		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
 			kvm_rip_write(ctxt->vcpu, c->eip);
 			goto done;
 		}
@@ -1876,7 +1876,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 				goto done;
 			}
 		}
-		c->regs[VCPU_REGS_RCX]--;
+		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
 		c->eip = kvm_rip_read(ctxt->vcpu);
 	}
 
-- 
cgit v1.2.3-55-g7522


From af5b4f7ff7ec76400b89db9538accd9aeb996da4 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Mon, 15 Mar 2010 16:38:30 +0200
Subject: KVM: x86 emulator: check return value against correct define

Check return value against correct define instead of open code
the value.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4dce80560d26..670ca8f151d2 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -566,7 +566,7 @@ static u32 group2_table[] = {
 #define insn_fetch(_type, _size, _eip)                                  \
 ({	unsigned long _x;						\
 	rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));		\
-	if (rc != 0)							\
+	if (rc != X86EMUL_CONTINUE)					\
 		goto done;						\
 	(_eip) += (_size);						\
 	(_type)_x;							\
-- 
cgit v1.2.3-55-g7522


From 49c6799a2ce3a6a4dd66021dabeb468901c7a700 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Mon, 15 Mar 2010 16:38:31 +0200
Subject: KVM: Remove pointer to rflags from realmode_set_cr parameters.

Mov reg, cr instruction doesn't change flags in any meaningful way, so
no need to update rflags after instruction execution.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 3 +--
 arch/x86/kvm/emulate.c          | 3 +--
 arch/x86/kvm/x86.c              | 4 +---
 3 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 28826c82d1e2..53f520259471 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -587,8 +587,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
 		   unsigned long *rflags);
 
 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
-		     unsigned long *rflags);
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value);
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 670ca8f151d2..91450b5cd49e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2534,8 +2534,7 @@ twobyte_insn:
 	case 0x22: /* mov reg, cr */
 		if (c->modrm_mod != 3)
 			goto cannot_emulate;
-		realmode_set_cr(ctxt->vcpu,
-				c->modrm_reg, c->modrm_val, &ctxt->eflags);
+		realmode_set_cr(ctxt->vcpu, c->modrm_reg, c->modrm_val);
 		c->dst.type = OP_NONE;
 		break;
 	case 0x23: /* mov from reg to dr */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dd4a7ad63aff..35db4f0db4ea 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4080,13 +4080,11 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
 	return value;
 }
 
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
-		     unsigned long *rflags)
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val)
 {
 	switch (cr) {
 	case 0:
 		kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
-		*rflags = kvm_get_rflags(vcpu);
 		break;
 	case 2:
 		vcpu->arch.cr2 = val;
-- 
cgit v1.2.3-55-g7522


From 31299944584fd62df8b0cfa30ad2c56f445b8cf2 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng
Date: Mon, 15 Mar 2010 17:29:09 +0800
Subject: KVM: VMX: change to use bool return values

Make use of bool as return values, and remove some useless
bool value converting. Thanks Avi to point this out.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 54 +++++++++++++++++++++++++++---------------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 33d88e0a0601..87b3c6843aac 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -234,56 +234,56 @@ static const u32 vmx_msr_index[] = {
 };
 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
 
-static inline int is_page_fault(u32 intr_info)
+static inline bool is_page_fault(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
 		(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
 }
 
-static inline int is_no_device(u32 intr_info)
+static inline bool is_no_device(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
 		(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
 }
 
-static inline int is_invalid_opcode(u32 intr_info)
+static inline bool is_invalid_opcode(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
 		(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
 }
 
-static inline int is_external_interrupt(u32 intr_info)
+static inline bool is_external_interrupt(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
 		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 }
 
-static inline int is_machine_check(u32 intr_info)
+static inline bool is_machine_check(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
 		(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
 }
 
-static inline int cpu_has_vmx_msr_bitmap(void)
+static inline bool cpu_has_vmx_msr_bitmap(void)
 {
 	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
 }
 
-static inline int cpu_has_vmx_tpr_shadow(void)
+static inline bool cpu_has_vmx_tpr_shadow(void)
 {
 	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
 }
 
-static inline int vm_need_tpr_shadow(struct kvm *kvm)
+static inline bool vm_need_tpr_shadow(struct kvm *kvm)
 {
 	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
 }
 
-static inline int cpu_has_secondary_exec_ctrls(void)
+static inline bool cpu_has_secondary_exec_ctrls(void)
 {
 	return vmcs_config.cpu_based_exec_ctrl &
 		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -303,80 +303,80 @@ static inline bool cpu_has_vmx_flexpriority(void)
 
 static inline bool cpu_has_vmx_ept_execute_only(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT);
+	return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
 }
 
 static inline bool cpu_has_vmx_eptp_uncacheable(void)
 {
-	return !!(vmx_capability.ept & VMX_EPTP_UC_BIT);
+	return vmx_capability.ept & VMX_EPTP_UC_BIT;
 }
 
 static inline bool cpu_has_vmx_eptp_writeback(void)
 {
-	return !!(vmx_capability.ept & VMX_EPTP_WB_BIT);
+	return vmx_capability.ept & VMX_EPTP_WB_BIT;
 }
 
 static inline bool cpu_has_vmx_ept_2m_page(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
+	return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
 }
 
 static inline bool cpu_has_vmx_ept_1g_page(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
+	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
 }
 
-static inline int cpu_has_vmx_invept_individual_addr(void)
+static inline bool cpu_has_vmx_invept_individual_addr(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
+	return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
 }
 
-static inline int cpu_has_vmx_invept_context(void)
+static inline bool cpu_has_vmx_invept_context(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
+	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
 }
 
-static inline int cpu_has_vmx_invept_global(void)
+static inline bool cpu_has_vmx_invept_global(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
+	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
 }
 
-static inline int cpu_has_vmx_ept(void)
+static inline bool cpu_has_vmx_ept(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_ENABLE_EPT;
 }
 
-static inline int cpu_has_vmx_unrestricted_guest(void)
+static inline bool cpu_has_vmx_unrestricted_guest(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_UNRESTRICTED_GUEST;
 }
 
-static inline int cpu_has_vmx_ple(void)
+static inline bool cpu_has_vmx_ple(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 }
 
-static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
+static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
 	return flexpriority_enabled && irqchip_in_kernel(kvm);
 }
 
-static inline int cpu_has_vmx_vpid(void)
+static inline bool cpu_has_vmx_vpid(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_ENABLE_VPID;
 }
 
-static inline int cpu_has_vmx_rdtscp(void)
+static inline bool cpu_has_vmx_rdtscp(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_RDTSCP;
 }
 
-static inline int cpu_has_virtual_nmis(void)
+static inline bool cpu_has_virtual_nmis(void)
 {
 	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 }
-- 
cgit v1.2.3-55-g7522


From 52a4661737ecc918633f6b05c611a4af4b5eae5a Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:03 +0200
Subject: KVM: Provide callback to get/set control registers in emulator ops.

Use this callback instead of directly call kvm function. Also rename
realmode_(set|get)_cr to emulator_(set|get)_cr since function has nothing
to do with real mode.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |   3 +-
 arch/x86/include/asm/kvm_host.h    |   2 -
 arch/x86/kvm/emulate.c             |   7 +--
 arch/x86/kvm/x86.c                 | 114 +++++++++++++++++++------------------
 4 files changed, 63 insertions(+), 63 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 2666d7ac3229..0c5caa469eb8 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -108,7 +108,8 @@ struct x86_emulate_ops {
 				const void *new,
 				unsigned int bytes,
 				struct kvm_vcpu *vcpu);
-
+	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
+	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
 };
 
 /* Type, address-of, and value of an instruction's operand. */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 53f520259471..9d474c7ae261 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -586,8 +586,6 @@ void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
 		   unsigned long *rflags);
 
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value);
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 91450b5cd49e..5b060e4be0e3 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2483,7 +2483,7 @@ twobyte_insn:
 			break;
 		case 4: /* smsw */
 			c->dst.bytes = 2;
-			c->dst.val = realmode_get_cr(ctxt->vcpu, 0);
+			c->dst.val = ops->get_cr(0, ctxt->vcpu);
 			break;
 		case 6: /* lmsw */
 			realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
@@ -2519,8 +2519,7 @@ twobyte_insn:
 	case 0x20: /* mov cr, reg */
 		if (c->modrm_mod != 3)
 			goto cannot_emulate;
-		c->regs[c->modrm_rm] =
-				realmode_get_cr(ctxt->vcpu, c->modrm_reg);
+		c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x21: /* mov from dr to reg */
@@ -2534,7 +2533,7 @@ twobyte_insn:
 	case 0x22: /* mov reg, cr */
 		if (c->modrm_mod != 3)
 			goto cannot_emulate;
-		realmode_set_cr(ctxt->vcpu, c->modrm_reg, c->modrm_val);
+		ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
 		c->dst.type = OP_NONE;
 		break;
 	case 0x23: /* mov from reg to dr */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 35db4f0db4ea..94a29759ab2c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3423,12 +3423,70 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 }
 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
 
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
+{
+	unsigned long value;
+
+	switch (cr) {
+	case 0:
+		value = kvm_read_cr0(vcpu);
+		break;
+	case 2:
+		value = vcpu->arch.cr2;
+		break;
+	case 3:
+		value = vcpu->arch.cr3;
+		break;
+	case 4:
+		value = kvm_read_cr4(vcpu);
+		break;
+	case 8:
+		value = kvm_get_cr8(vcpu);
+		break;
+	default:
+		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+		return 0;
+	}
+
+	return value;
+}
+
+static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+{
+	switch (cr) {
+	case 0:
+		kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+		break;
+	case 2:
+		vcpu->arch.cr2 = val;
+		break;
+	case 3:
+		kvm_set_cr3(vcpu, val);
+		break;
+	case 4:
+		kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+		break;
+	case 8:
+		kvm_set_cr8(vcpu, val & 0xfUL);
+		break;
+	default:
+		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+	}
+}
+
 static struct x86_emulate_ops emulate_ops = {
 	.read_std            = kvm_read_guest_virt_system,
 	.fetch               = kvm_fetch_guest_virt,
 	.read_emulated       = emulator_read_emulated,
 	.write_emulated      = emulator_write_emulated,
 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
+	.get_cr              = emulator_get_cr,
+	.set_cr              = emulator_set_cr,
 };
 
 static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -4026,11 +4084,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 	return __emulator_write_emulated(rip, instruction, 3, vcpu, false);
 }
 
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-{
-	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
-}
-
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 {
 	struct desc_ptr dt = { limit, base };
@@ -4052,57 +4105,6 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
 	*rflags = kvm_get_rflags(vcpu);
 }
 
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
-{
-	unsigned long value;
-
-	switch (cr) {
-	case 0:
-		value = kvm_read_cr0(vcpu);
-		break;
-	case 2:
-		value = vcpu->arch.cr2;
-		break;
-	case 3:
-		value = vcpu->arch.cr3;
-		break;
-	case 4:
-		value = kvm_read_cr4(vcpu);
-		break;
-	case 8:
-		value = kvm_get_cr8(vcpu);
-		break;
-	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
-		return 0;
-	}
-
-	return value;
-}
-
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val)
-{
-	switch (cr) {
-	case 0:
-		kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
-		break;
-	case 2:
-		vcpu->arch.cr2 = val;
-		break;
-	case 3:
-		kvm_set_cr3(vcpu, val);
-		break;
-	case 4:
-		kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
-		break;
-	case 8:
-		kvm_set_cr8(vcpu, val & 0xfUL);
-		break;
-	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
-	}
-}
-
 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
 {
 	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
-- 
cgit v1.2.3-55-g7522


From 93a152be5af3d651ff0ab5459f5e0f9662b22438 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:04 +0200
Subject: KVM: remove realmode_lmsw function.

Use (get|set)_cr callback to emulate lmsw inside emulator.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 --
 arch/x86/kvm/emulate.c          | 4 ++--
 arch/x86/kvm/x86.c              | 7 -------
 3 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9d474c7ae261..b99cec1547c6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -583,8 +583,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
-		   unsigned long *rflags);
 
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5b060e4be0e3..5e2fa61e8104 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2486,8 +2486,8 @@ twobyte_insn:
 			c->dst.val = ops->get_cr(0, ctxt->vcpu);
 			break;
 		case 6: /* lmsw */
-			realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
-				      &ctxt->eflags);
+			ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
+				    (c->src.val & 0x0f), ctxt->vcpu);
 			c->dst.type = OP_NONE;
 			break;
 		case 7: /* invlpg*/
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 94a29759ab2c..c382e9721099 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4098,13 +4098,6 @@ void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 	kvm_x86_ops->set_idt(vcpu, &dt);
 }
 
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
-		   unsigned long *rflags)
-{
-	kvm_lmsw(vcpu, msw);
-	*rflags = kvm_get_rflags(vcpu);
-}
-
 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
 {
 	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
-- 
cgit v1.2.3-55-g7522


From 9c5372445c1ad4fcdb4128957ec89334223b8113 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:05 +0200
Subject: KVM: Provide x86_emulate_ctxt callback to get current cpl

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  1 +
 arch/x86/kvm/emulate.c             | 15 ++++++++-------
 arch/x86/kvm/x86.c                 |  6 ++++++
 3 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0c5caa469eb8..b048fd21c54e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -110,6 +110,7 @@ struct x86_emulate_ops {
 				struct kvm_vcpu *vcpu);
 	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
 	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
+	int (*cpl)(struct kvm_vcpu *vcpu);
 };
 
 /* Type, address-of, and value of an instruction's operand. */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5e2fa61e8104..8bd05571672c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1257,7 +1257,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 	int rc;
 	unsigned long val, change_mask;
 	int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-	int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
+	int cpl = ops->cpl(ctxt->vcpu);
 
 	rc = emulate_pop(ctxt, ops, &val, len);
 	if (rc != X86EMUL_CONTINUE)
@@ -1758,7 +1758,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
+			      struct x86_emulate_ops *ops)
 {
 	int iopl;
 	if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -1766,7 +1767,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
 	if (ctxt->mode == X86EMUL_MODE_VM86)
 		return true;
 	iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-	return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
+	return ops->cpl(ctxt->vcpu) > iopl;
 }
 
 static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -1803,7 +1804,7 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 				 struct x86_emulate_ops *ops,
 				 u16 port, u16 len)
 {
-	if (emulator_bad_iopl(ctxt))
+	if (emulator_bad_iopl(ctxt, ops))
 		if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
 			return false;
 	return true;
@@ -1842,7 +1843,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	/* Privileged instruction can be executed only in CPL=0 */
-	if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
+	if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
 		kvm_inject_gp(ctxt->vcpu, 0);
 		goto done;
 	}
@@ -2378,7 +2379,7 @@ special_insn:
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xfa: /* cli */
-		if (emulator_bad_iopl(ctxt))
+		if (emulator_bad_iopl(ctxt, ops))
 			kvm_inject_gp(ctxt->vcpu, 0);
 		else {
 			ctxt->eflags &= ~X86_EFLAGS_IF;
@@ -2386,7 +2387,7 @@ special_insn:
 		}
 		break;
 	case 0xfb: /* sti */
-		if (emulator_bad_iopl(ctxt))
+		if (emulator_bad_iopl(ctxt, ops))
 			kvm_inject_gp(ctxt->vcpu, 0);
 		else {
 			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c382e9721099..9cb28a943c9a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3479,6 +3479,11 @@ static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
 	}
 }
 
+static int emulator_get_cpl(struct kvm_vcpu *vcpu)
+{
+	return kvm_x86_ops->get_cpl(vcpu);
+}
+
 static struct x86_emulate_ops emulate_ops = {
 	.read_std            = kvm_read_guest_virt_system,
 	.fetch               = kvm_fetch_guest_virt,
@@ -3487,6 +3492,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
+	.cpl                 = emulator_get_cpl,
 };
 
 static void cache_all_regs(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3-55-g7522


From 063db061b9b3472c925f09ae3a0a8359b80c2295 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:06 +0200
Subject: KVM: Provide current eip as part of emulator context.

Eliminate the need to call back into KVM to get it from emulator.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  3 ++-
 arch/x86/kvm/emulate.c             | 12 ++++++------
 arch/x86/kvm/x86.c                 |  1 +
 3 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b048fd21c54e..07657258af8f 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -141,7 +141,7 @@ struct decode_cache {
 	u8 seg_override;
 	unsigned int d;
 	unsigned long regs[NR_VCPU_REGS];
-	unsigned long eip, eip_orig;
+	unsigned long eip;
 	/* modrm */
 	u8 modrm;
 	u8 modrm_mod;
@@ -160,6 +160,7 @@ struct x86_emulate_ctxt {
 	struct kvm_vcpu *vcpu;
 
 	unsigned long eflags;
+	unsigned long eip; /* eip before instruction emulation */
 	/* Emulated execution mode, represented by an X86EMUL_MODE value. */
 	int mode;
 	u32 cs_base;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8bd05571672c..2c27aa466cf4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -667,7 +667,7 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 	int rc;
 
 	/* x86 instructions are limited to 15 bytes. */
-	if (eip + size - ctxt->decode.eip_orig > 15)
+	if (eip + size - ctxt->eip > 15)
 		return X86EMUL_UNHANDLEABLE;
 	eip += ctxt->cs_base;
 	while (size--) {
@@ -927,7 +927,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	/* Shadow copy of register state. Committed on successful emulation. */
 
 	memset(c, 0, sizeof(struct decode_cache));
-	c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
+	c->eip = ctxt->eip;
 	ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 
@@ -1878,7 +1878,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 			}
 		}
 		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
-		c->eip = kvm_rip_read(ctxt->vcpu);
+		c->eip = ctxt->eip;
 	}
 
 	if (c->src.type == OP_MEM) {
@@ -2447,7 +2447,7 @@ twobyte_insn:
 				goto done;
 
 			/* Let the processor re-execute the fixed hypercall */
-			c->eip = kvm_rip_read(ctxt->vcpu);
+			c->eip = ctxt->eip;
 			/* Disable writeback. */
 			c->dst.type = OP_NONE;
 			break;
@@ -2551,7 +2551,7 @@ twobyte_insn:
 			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
 		if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = kvm_rip_read(ctxt->vcpu);
+			c->eip = ctxt->eip;
 		}
 		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;
@@ -2560,7 +2560,7 @@ twobyte_insn:
 		/* rdmsr */
 		if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = kvm_rip_read(ctxt->vcpu);
+			c->eip = ctxt->eip;
 		} else {
 			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
 			c->regs[VCPU_REGS_RDX] = msr_data >> 32;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9cb28a943c9a..0ecd37ac9d39 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3531,6 +3531,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 
 		vcpu->arch.emulate_ctxt.vcpu = vcpu;
 		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+		vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
 		vcpu->arch.emulate_ctxt.mode =
 			(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
 			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-- 
cgit v1.2.3-55-g7522


From 5e3ae6c5407ffb23bc4d9871e09d1b222e1b31a4 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:07 +0200
Subject: KVM: x86 emulator: fix mov r/m, sreg emulation.

mov r/m, sreg generates #UD ins sreg is incorrect.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2c27aa466cf4..c3b9334eb248 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2126,12 +2126,11 @@ special_insn:
 	case 0x8c: { /* mov r/m, sreg */
 		struct kvm_segment segreg;
 
-		if (c->modrm_reg <= 5)
+		if (c->modrm_reg <= VCPU_SREG_GS)
 			kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
 		else {
-			printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
-			       c->modrm);
-			goto cannot_emulate;
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
 		}
 		c->dst.val = segreg.selector;
 		break;
-- 
cgit v1.2.3-55-g7522


From 6e1e5ffee8d95f9bce71eaa029cb5247b0f2f673 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:08 +0200
Subject: KVM: x86 emulator: fix 0f 01 /5 emulation

It is undefined and should generate #UD.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c3b9334eb248..7c7debb424df 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2490,6 +2490,9 @@ twobyte_insn:
 				    (c->src.val & 0x0f), ctxt->vcpu);
 			c->dst.type = OP_NONE;
 			break;
+		case 5: /* not defined */
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
 		case 7: /* invlpg*/
 			emulate_invlpg(ctxt->vcpu, memop);
 			/* Disable writeback. */
-- 
cgit v1.2.3-55-g7522


From ab8557b2b361c8bb2e2421c791c8f6c4f6ba3d08 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:09 +0200
Subject: KVM: x86 emulator: 0f (20|21|22|23) ignore mod bits.

Resent spec says that for 0f (20|21|22|23) the 2 bits in the mod field
are ignored. Interestingly enough older spec says that 11 is only valid
encoding.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7c7debb424df..fa4604e03250 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2520,28 +2520,20 @@ twobyte_insn:
 		c->dst.type = OP_NONE;
 		break;
 	case 0x20: /* mov cr, reg */
-		if (c->modrm_mod != 3)
-			goto cannot_emulate;
 		c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x21: /* mov from dr to reg */
-		if (c->modrm_mod != 3)
-			goto cannot_emulate;
 		if (emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]))
 			goto cannot_emulate;
 		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x22: /* mov reg, cr */
-		if (c->modrm_mod != 3)
-			goto cannot_emulate;
 		ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
 		c->dst.type = OP_NONE;
 		break;
 	case 0x23: /* mov from reg to dr */
-		if (c->modrm_mod != 3)
-			goto cannot_emulate;
 		if (emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]))
 			goto cannot_emulate;
 		rc = X86EMUL_CONTINUE;
-- 
cgit v1.2.3-55-g7522


From 6aebfa6ea75f9a02a0339e733090dd40d6f2edfd Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:10 +0200
Subject: KVM: x86 emulator: inject #UD on access to non-existing CR

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fa4604e03250..836e97ba45da 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2520,6 +2520,13 @@ twobyte_insn:
 		c->dst.type = OP_NONE;
 		break;
 	case 0x20: /* mov cr, reg */
+		switch (c->modrm_reg) {
+		case 1:
+		case 5 ... 7:
+		case 9 ... 15:
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
+		}
 		c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
-- 
cgit v1.2.3-55-g7522


From 1e470be5a10801cb1c5c145f2cd9e0f5ebaf4f2e Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:11 +0200
Subject: KVM: x86 emulator: fix mov dr to inject #UD when needed.

If CR4.DE=1 access to registers DR4/DR5 cause #UD.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 836e97ba45da..5afddcfa1a7e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2531,9 +2531,12 @@ twobyte_insn:
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x21: /* mov from dr to reg */
-		if (emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]))
-			goto cannot_emulate;
-		rc = X86EMUL_CONTINUE;
+		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
+		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
+		}
+		emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x22: /* mov reg, cr */
@@ -2541,9 +2544,12 @@ twobyte_insn:
 		c->dst.type = OP_NONE;
 		break;
 	case 0x23: /* mov from reg to dr */
-		if (emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]))
-			goto cannot_emulate;
-		rc = X86EMUL_CONTINUE;
+		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
+		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
+		}
+		emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x30:
-- 
cgit v1.2.3-55-g7522


From 2e901c4cf4b550ad37840870246e835889cf7322 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:12 +0200
Subject: KVM: x86 emulator: fix return values of syscall/sysenter/sysexit
 emulations

Return X86EMUL_PROPAGATE_FAULT is fault was injected. Also inject #UD
for those instruction when appropriate.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5afddcfa1a7e..1393bf034243 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1600,8 +1600,11 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
 	u64 msr_data;
 
 	/* syscall is not available in real mode */
-	if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
-		return X86EMUL_UNHANDLEABLE;
+	if (ctxt->mode == X86EMUL_MODE_REAL ||
+	    ctxt->mode == X86EMUL_MODE_VM86) {
+		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
 
 	setup_syscalls_segments(ctxt, &cs, &ss);
 
@@ -1651,14 +1654,16 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
 	/* inject #GP if in real mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL) {
 		kvm_inject_gp(ctxt->vcpu, 0);
-		return X86EMUL_UNHANDLEABLE;
+		return X86EMUL_PROPAGATE_FAULT;
 	}
 
 	/* XXX sysenter/sysexit have not been tested in 64bit mode.
 	* Therefore, we inject an #UD.
 	*/
-	if (ctxt->mode == X86EMUL_MODE_PROT64)
-		return X86EMUL_UNHANDLEABLE;
+	if (ctxt->mode == X86EMUL_MODE_PROT64) {
+		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
 
 	setup_syscalls_segments(ctxt, &cs, &ss);
 
@@ -1713,7 +1718,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
 	    ctxt->mode == X86EMUL_MODE_VM86) {
 		kvm_inject_gp(ctxt->vcpu, 0);
-		return X86EMUL_UNHANDLEABLE;
+		return X86EMUL_PROPAGATE_FAULT;
 	}
 
 	setup_syscalls_segments(ctxt, &cs, &ss);
-- 
cgit v1.2.3-55-g7522


From fd5253658b403d51fc19e56ecb44c54a3071fded Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:13 +0200
Subject: KVM: x86 emulator: do not call writeback if msr access fails.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1393bf034243..b89a8f217332 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2563,7 +2563,7 @@ twobyte_insn:
 			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
 		if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = ctxt->eip;
+			goto done;
 		}
 		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;
@@ -2572,7 +2572,7 @@ twobyte_insn:
 		/* rdmsr */
 		if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = ctxt->eip;
+			goto done;
 		} else {
 			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
 			c->regs[VCPU_REGS_RDX] = msr_data >> 32;
-- 
cgit v1.2.3-55-g7522


From a41ffb7540cb37426759e688083502d6463421b2 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:14 +0200
Subject: KVM: x86 emulator: If LOCK prefix is used dest arg should be memory.

If LOCK prefix is used dest arg should be memory, otherwise instruction
should generate #UD.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b89a8f217332..46a7ee3040a0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1842,7 +1842,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	/* LOCK prefix is allowed only with some instructions */
-	if (c->lock_prefix && !(c->d & Lock)) {
+	if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
 		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
 		goto done;
 	}
-- 
cgit v1.2.3-55-g7522


From aca06a83071e4e4c9150751db7ea6a46240734fc Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:15 +0200
Subject: KVM: x86 emulator: cleanup grp3 return value

When x86_emulate_insn() does not know how to emulate instruction it
exits via cannot_emulate label in all cases except when emulating
grp3. Fix that.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 46a7ee3040a0..d696cbd6ff7a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1397,7 +1397,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 			       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = X86EMUL_CONTINUE;
 
 	switch (c->modrm_reg) {
 	case 0 ... 1:	/* test */
@@ -1410,11 +1409,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 		emulate_1op("neg", c->dst, ctxt->eflags);
 		break;
 	default:
-		DPRINTF("Cannot emulate %02x\n", c->b);
-		rc = X86EMUL_UNHANDLEABLE;
-		break;
+		return 0;
 	}
-	return rc;
+	return 1;
 }
 
 static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -2374,9 +2371,8 @@ special_insn:
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xf6 ... 0xf7:	/* Grp3 */
-		rc = emulate_grp3(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
+		if (!emulate_grp3(ctxt, ops))
+			goto cannot_emulate;
 		break;
 	case 0xf8: /* clc */
 		ctxt->eflags &= ~EFLG_CF;
-- 
cgit v1.2.3-55-g7522


From 2dafc6c234b6064189405f42e1602e9a0abe5a44 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:16 +0200
Subject: KVM: x86 emulator: Provide more callbacks for x86 emulator.

Provide get_cached_descriptor(), set_cached_descriptor(),
get_segment_selector(), set_segment_selector(), get_gdt(),
write_std() callbacks.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  16 +++++
 arch/x86/kvm/x86.c                 | 130 ++++++++++++++++++++++++++++++++-----
 2 files changed, 131 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 07657258af8f..f901467a18b0 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -62,6 +62,15 @@ struct x86_emulate_ops {
 	int (*read_std)(unsigned long addr, void *val,
 			unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
 
+	/*
+	 * write_std: Write bytes of standard (non-emulated/special) memory.
+	 *            Used for descriptor writing.
+	 *  @addr:  [IN ] Linear address to which to write.
+	 *  @val:   [OUT] Value write to memory, zero-extended to 'u_long'.
+	 *  @bytes: [IN ] Number of bytes to write to memory.
+	 */
+	int (*write_std)(unsigned long addr, void *val,
+			 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
 	/*
 	 * fetch: Read bytes of standard (non-emulated/special) memory.
 	 *        Used for instruction fetch.
@@ -108,6 +117,13 @@ struct x86_emulate_ops {
 				const void *new,
 				unsigned int bytes,
 				struct kvm_vcpu *vcpu);
+	bool (*get_cached_descriptor)(struct desc_struct *desc,
+				      int seg, struct kvm_vcpu *vcpu);
+	void (*set_cached_descriptor)(struct desc_struct *desc,
+				      int seg, struct kvm_vcpu *vcpu);
+	u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
+	void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
+	void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
 	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
 	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
 	int (*cpl)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0ecd37ac9d39..fbee8fbb33b5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3058,6 +3058,18 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 	return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
 }
 
+static void kvm_set_segment(struct kvm_vcpu *vcpu,
+			struct kvm_segment *var, int seg)
+{
+	kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+		     struct kvm_segment *var, int seg)
+{
+	kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@ -3138,14 +3150,18 @@ static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
 }
 
-static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
-				struct kvm_vcpu *vcpu, u32 *error)
+static int kvm_write_guest_virt_helper(gva_t addr, void *val,
+				       unsigned int bytes,
+				       struct kvm_vcpu *vcpu, u32 access,
+				       u32 *error)
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
 
+	access |= PFERR_WRITE_MASK;
+
 	while (bytes) {
-		gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
+		gpa_t gpa =  vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
@@ -3168,6 +3184,19 @@ out:
 	return r;
 }
 
+static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+				struct kvm_vcpu *vcpu, u32 *error)
+{
+	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, access, error);
+}
+
+static int kvm_write_guest_virt_system(gva_t addr, void *val,
+				       unsigned int bytes,
+				       struct kvm_vcpu *vcpu, u32 *error)
+{
+	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
+}
 
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
@@ -3484,12 +3513,95 @@ static int emulator_get_cpl(struct kvm_vcpu *vcpu)
 	return kvm_x86_ops->get_cpl(vcpu);
 }
 
+static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->get_gdt(vcpu, dt);
+}
+
+static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
+					   struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment var;
+
+	kvm_get_segment(vcpu, &var, seg);
+
+	if (var.unusable)
+		return false;
+
+	if (var.g)
+		var.limit >>= 12;
+	set_desc_limit(desc, var.limit);
+	set_desc_base(desc, (unsigned long)var.base);
+	desc->type = var.type;
+	desc->s = var.s;
+	desc->dpl = var.dpl;
+	desc->p = var.present;
+	desc->avl = var.avl;
+	desc->l = var.l;
+	desc->d = var.db;
+	desc->g = var.g;
+
+	return true;
+}
+
+static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
+					   struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment var;
+
+	/* needed to preserve selector */
+	kvm_get_segment(vcpu, &var, seg);
+
+	var.base = get_desc_base(desc);
+	var.limit = get_desc_limit(desc);
+	if (desc->g)
+		var.limit = (var.limit << 12) | 0xfff;
+	var.type = desc->type;
+	var.present = desc->p;
+	var.dpl = desc->dpl;
+	var.db = desc->d;
+	var.s = desc->s;
+	var.l = desc->l;
+	var.g = desc->g;
+	var.avl = desc->avl;
+	var.present = desc->p;
+	var.unusable = !var.present;
+	var.padding = 0;
+
+	kvm_set_segment(vcpu, &var, seg);
+	return;
+}
+
+static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment kvm_seg;
+
+	kvm_get_segment(vcpu, &kvm_seg, seg);
+	return kvm_seg.selector;
+}
+
+static void emulator_set_segment_selector(u16 sel, int seg,
+					  struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment kvm_seg;
+
+	kvm_get_segment(vcpu, &kvm_seg, seg);
+	kvm_seg.selector = sel;
+	kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
 static struct x86_emulate_ops emulate_ops = {
 	.read_std            = kvm_read_guest_virt_system,
+	.write_std           = kvm_write_guest_virt_system,
 	.fetch               = kvm_fetch_guest_virt,
 	.read_emulated       = emulator_read_emulated,
 	.write_emulated      = emulator_write_emulated,
 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
+	.get_cached_descriptor = emulator_get_cached_descriptor,
+	.set_cached_descriptor = emulator_set_cached_descriptor,
+	.get_segment_selector = emulator_get_segment_selector,
+	.set_segment_selector = emulator_set_segment_selector,
+	.get_gdt             = emulator_get_gdt,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
 	.cpl                 = emulator_get_cpl,
@@ -4649,12 +4761,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	return 0;
 }
 
-void kvm_get_segment(struct kvm_vcpu *vcpu,
-		     struct kvm_segment *var, int seg)
-{
-	kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 {
 	struct kvm_segment cs;
@@ -4726,12 +4832,6 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static void kvm_set_segment(struct kvm_vcpu *vcpu,
-			struct kvm_segment *var, int seg)
-{
-	kvm_x86_ops->set_segment(vcpu, var, seg);
-}
-
 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
 				   struct kvm_segment *kvm_desct)
 {
-- 
cgit v1.2.3-55-g7522


From 38ba30ba51a003360f177d5b8349439fe44fc55b Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:17 +0200
Subject: KVM: x86 emulator: Emulate task switch in emulator.c

Implement emulation of 16/32 bit task switch in emulator.c

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |   5 +
 arch/x86/kvm/emulate.c             | 563 +++++++++++++++++++++++++++++++++++++
 2 files changed, 568 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index f901467a18b0..bd469296f5e5 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -11,6 +11,8 @@
 #ifndef _ASM_X86_KVM_X86_EMULATE_H
 #define _ASM_X86_KVM_X86_EMULATE_H
 
+#include <asm/desc_defs.h>
+
 struct x86_emulate_ctxt;
 
 /*
@@ -210,5 +212,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
 		    struct x86_emulate_ops *ops);
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
 		     struct x86_emulate_ops *ops);
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+			 struct x86_emulate_ops *ops,
+			 u16 tss_selector, int reason);
 
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d696cbd6ff7a..db4776c6b500 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -33,6 +33,7 @@
 #include <asm/kvm_emulate.h>
 
 #include "x86.h"
+#include "tss.h"
 
 /*
  * Opcode effective-address decode tables.
@@ -1221,6 +1222,198 @@ done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
+static u32 desc_limit_scaled(struct desc_struct *desc)
+{
+	u32 limit = get_desc_limit(desc);
+
+	return desc->g ? (limit << 12) | 0xfff : limit;
+}
+
+static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
+				     struct x86_emulate_ops *ops,
+				     u16 selector, struct desc_ptr *dt)
+{
+	if (selector & 1 << 2) {
+		struct desc_struct desc;
+		memset (dt, 0, sizeof *dt);
+		if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
+			return;
+
+		dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
+		dt->address = get_desc_base(&desc);
+	} else
+		ops->get_gdt(dt, ctxt->vcpu);
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				   struct x86_emulate_ops *ops,
+				   u16 selector, struct desc_struct *desc)
+{
+	struct desc_ptr dt;
+	u16 index = selector >> 3;
+	int ret;
+	u32 err;
+	ulong addr;
+
+	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+
+	if (dt.size < index * 8 + 7) {
+		kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+	addr = dt.address + index * 8;
+	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
+		kvm_inject_page_fault(ctxt->vcpu, addr, err);
+
+       return ret;
+}
+
+/* allowed just for 8 bytes segments */
+static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				    struct x86_emulate_ops *ops,
+				    u16 selector, struct desc_struct *desc)
+{
+	struct desc_ptr dt;
+	u16 index = selector >> 3;
+	u32 err;
+	ulong addr;
+	int ret;
+
+	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+
+	if (dt.size < index * 8 + 7) {
+		kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+
+	addr = dt.address + index * 8;
+	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
+		kvm_inject_page_fault(ctxt->vcpu, addr, err);
+
+	return ret;
+}
+
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				   struct x86_emulate_ops *ops,
+				   u16 selector, int seg)
+{
+	struct desc_struct seg_desc;
+	u8 dpl, rpl, cpl;
+	unsigned err_vec = GP_VECTOR;
+	u32 err_code = 0;
+	bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+	int ret;
+
+	memset(&seg_desc, 0, sizeof seg_desc);
+
+	if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
+	    || ctxt->mode == X86EMUL_MODE_REAL) {
+		/* set real mode segment descriptor */
+		set_desc_base(&seg_desc, selector << 4);
+		set_desc_limit(&seg_desc, 0xffff);
+		seg_desc.type = 3;
+		seg_desc.p = 1;
+		seg_desc.s = 1;
+		goto load;
+	}
+
+	/* NULL selector is not valid for TR, CS and SS */
+	if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+	    && null_selector)
+		goto exception;
+
+	/* TR should be in GDT only */
+	if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+		goto exception;
+
+	if (null_selector) /* for NULL selector skip all following checks */
+		goto load;
+
+	ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	err_code = selector & 0xfffc;
+	err_vec = GP_VECTOR;
+
+	/* can't load system descriptor into segment selecor */
+	if (seg <= VCPU_SREG_GS && !seg_desc.s)
+		goto exception;
+
+	if (!seg_desc.p) {
+		err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+		goto exception;
+	}
+
+	rpl = selector & 3;
+	dpl = seg_desc.dpl;
+	cpl = ops->cpl(ctxt->vcpu);
+
+	switch (seg) {
+	case VCPU_SREG_SS:
+		/*
+		 * segment is not a writable data segment or segment
+		 * selector's RPL != CPL or segment selector's RPL != CPL
+		 */
+		if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
+			goto exception;
+		break;
+	case VCPU_SREG_CS:
+		if (!(seg_desc.type & 8))
+			goto exception;
+
+		if (seg_desc.type & 4) {
+			/* conforming */
+			if (dpl > cpl)
+				goto exception;
+		} else {
+			/* nonconforming */
+			if (rpl > cpl || dpl != cpl)
+				goto exception;
+		}
+		/* CS(RPL) <- CPL */
+		selector = (selector & 0xfffc) | cpl;
+		break;
+	case VCPU_SREG_TR:
+		if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
+			goto exception;
+		break;
+	case VCPU_SREG_LDTR:
+		if (seg_desc.s || seg_desc.type != 2)
+			goto exception;
+		break;
+	default: /*  DS, ES, FS, or GS */
+		/*
+		 * segment is not a data or readable code segment or
+		 * ((segment is a data or nonconforming code segment)
+		 * and (both RPL and CPL > DPL))
+		 */
+		if ((seg_desc.type & 0xa) == 0x8 ||
+		    (((seg_desc.type & 0xc) != 0xc) &&
+		     (rpl > dpl && cpl > dpl)))
+			goto exception;
+		break;
+	}
+
+	if (seg_desc.s) {
+		/* mark segment as accessed */
+		seg_desc.type |= 1;
+		ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
+		if (ret != X86EMUL_CONTINUE)
+			return ret;
+	}
+load:
+	ops->set_segment_selector(selector, seg, ctxt->vcpu);
+	ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
+	return X86EMUL_CONTINUE;
+exception:
+	kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
+	return X86EMUL_PROPAGATE_FAULT;
+}
+
 static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
@@ -1812,6 +2005,376 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 	return true;
 }
 
+static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
+				      struct x86_emulate_ops *ops,
+				      int seg)
+{
+	struct desc_struct desc;
+	if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
+		return get_desc_base(&desc);
+	else
+		return ~0;
+}
+
+static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
+				struct x86_emulate_ops *ops,
+				struct tss_segment_16 *tss)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	tss->ip = c->eip;
+	tss->flag = ctxt->eflags;
+	tss->ax = c->regs[VCPU_REGS_RAX];
+	tss->cx = c->regs[VCPU_REGS_RCX];
+	tss->dx = c->regs[VCPU_REGS_RDX];
+	tss->bx = c->regs[VCPU_REGS_RBX];
+	tss->sp = c->regs[VCPU_REGS_RSP];
+	tss->bp = c->regs[VCPU_REGS_RBP];
+	tss->si = c->regs[VCPU_REGS_RSI];
+	tss->di = c->regs[VCPU_REGS_RDI];
+
+	tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
+	tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+	tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
+	tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
+	tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+}
+
+static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
+				 struct x86_emulate_ops *ops,
+				 struct tss_segment_16 *tss)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int ret;
+
+	c->eip = tss->ip;
+	ctxt->eflags = tss->flag | 2;
+	c->regs[VCPU_REGS_RAX] = tss->ax;
+	c->regs[VCPU_REGS_RCX] = tss->cx;
+	c->regs[VCPU_REGS_RDX] = tss->dx;
+	c->regs[VCPU_REGS_RBX] = tss->bx;
+	c->regs[VCPU_REGS_RSP] = tss->sp;
+	c->regs[VCPU_REGS_RBP] = tss->bp;
+	c->regs[VCPU_REGS_RSI] = tss->si;
+	c->regs[VCPU_REGS_RDI] = tss->di;
+
+	/*
+	 * SDM says that segment selectors are loaded before segment
+	 * descriptors
+	 */
+	ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
+	ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
+	ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+
+	/*
+	 * Now load segment descriptors. If fault happenes at this stage
+	 * it is handled in a context of new task
+	 */
+	ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	return X86EMUL_CONTINUE;
+}
+
+static int task_switch_16(struct x86_emulate_ctxt *ctxt,
+			  struct x86_emulate_ops *ops,
+			  u16 tss_selector, u16 old_tss_sel,
+			  ulong old_tss_base, struct desc_struct *new_desc)
+{
+	struct tss_segment_16 tss_seg;
+	int ret;
+	u32 err, new_tss_base = get_desc_base(new_desc);
+
+	ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			    &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		return ret;
+	}
+
+	save_state_to_tss16(ctxt, ops, &tss_seg);
+
+	ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			     &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		return ret;
+	}
+
+	ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			    &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+		return ret;
+	}
+
+	if (old_tss_sel != 0xffff) {
+		tss_seg.prev_task_link = old_tss_sel;
+
+		ret = ops->write_std(new_tss_base,
+				     &tss_seg.prev_task_link,
+				     sizeof tss_seg.prev_task_link,
+				     ctxt->vcpu, &err);
+		if (ret == X86EMUL_PROPAGATE_FAULT) {
+			/* FIXME: need to provide precise fault address */
+			kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+			return ret;
+		}
+	}
+
+	return load_state_from_tss16(ctxt, ops, &tss_seg);
+}
+
+static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
+				struct x86_emulate_ops *ops,
+				struct tss_segment_32 *tss)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	tss->cr3 = ops->get_cr(3, ctxt->vcpu);
+	tss->eip = c->eip;
+	tss->eflags = ctxt->eflags;
+	tss->eax = c->regs[VCPU_REGS_RAX];
+	tss->ecx = c->regs[VCPU_REGS_RCX];
+	tss->edx = c->regs[VCPU_REGS_RDX];
+	tss->ebx = c->regs[VCPU_REGS_RBX];
+	tss->esp = c->regs[VCPU_REGS_RSP];
+	tss->ebp = c->regs[VCPU_REGS_RBP];
+	tss->esi = c->regs[VCPU_REGS_RSI];
+	tss->edi = c->regs[VCPU_REGS_RDI];
+
+	tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
+	tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+	tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
+	tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
+	tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
+	tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
+	tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+}
+
+static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
+				 struct x86_emulate_ops *ops,
+				 struct tss_segment_32 *tss)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int ret;
+
+	ops->set_cr(3, tss->cr3, ctxt->vcpu);
+	c->eip = tss->eip;
+	ctxt->eflags = tss->eflags | 2;
+	c->regs[VCPU_REGS_RAX] = tss->eax;
+	c->regs[VCPU_REGS_RCX] = tss->ecx;
+	c->regs[VCPU_REGS_RDX] = tss->edx;
+	c->regs[VCPU_REGS_RBX] = tss->ebx;
+	c->regs[VCPU_REGS_RSP] = tss->esp;
+	c->regs[VCPU_REGS_RBP] = tss->ebp;
+	c->regs[VCPU_REGS_RSI] = tss->esi;
+	c->regs[VCPU_REGS_RDI] = tss->edi;
+
+	/*
+	 * SDM says that segment selectors are loaded before segment
+	 * descriptors
+	 */
+	ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
+	ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
+	ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+	ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
+	ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
+
+	/*
+	 * Now load segment descriptors. If fault happenes at this stage
+	 * it is handled in a context of new task
+	 */
+	ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	return X86EMUL_CONTINUE;
+}
+
+static int task_switch_32(struct x86_emulate_ctxt *ctxt,
+			  struct x86_emulate_ops *ops,
+			  u16 tss_selector, u16 old_tss_sel,
+			  ulong old_tss_base, struct desc_struct *new_desc)
+{
+	struct tss_segment_32 tss_seg;
+	int ret;
+	u32 err, new_tss_base = get_desc_base(new_desc);
+
+	ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			    &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		return ret;
+	}
+
+	save_state_to_tss32(ctxt, ops, &tss_seg);
+
+	ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			     &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		return ret;
+	}
+
+	ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			    &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+		return ret;
+	}
+
+	if (old_tss_sel != 0xffff) {
+		tss_seg.prev_task_link = old_tss_sel;
+
+		ret = ops->write_std(new_tss_base,
+				     &tss_seg.prev_task_link,
+				     sizeof tss_seg.prev_task_link,
+				     ctxt->vcpu, &err);
+		if (ret == X86EMUL_PROPAGATE_FAULT) {
+			/* FIXME: need to provide precise fault address */
+			kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+			return ret;
+		}
+	}
+
+	return load_state_from_tss32(ctxt, ops, &tss_seg);
+}
+
+static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
+				    struct x86_emulate_ops *ops,
+				    u16 tss_selector, int reason)
+{
+	struct desc_struct curr_tss_desc, next_tss_desc;
+	int ret;
+	u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
+	ulong old_tss_base =
+		get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+
+	/* FIXME: old_tss_base == ~0 ? */
+
+	ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	/* FIXME: check that next_tss_desc is tss */
+
+	if (reason != TASK_SWITCH_IRET) {
+		if ((tss_selector & 3) > next_tss_desc.dpl ||
+		    ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
+			kvm_inject_gp(ctxt->vcpu, 0);
+			return X86EMUL_PROPAGATE_FAULT;
+		}
+	}
+
+	if (!next_tss_desc.p || desc_limit_scaled(&next_tss_desc) < 0x67) {
+		kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
+				      tss_selector & 0xfffc);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+
+	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+		curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
+		write_segment_descriptor(ctxt, ops, old_tss_sel,
+					 &curr_tss_desc);
+	}
+
+	if (reason == TASK_SWITCH_IRET)
+		ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
+
+	/* set back link to prev task only if NT bit is set in eflags
+	   note that old_tss_sel is not used afetr this point */
+	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+		old_tss_sel = 0xffff;
+
+	if (next_tss_desc.type & 8)
+		ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
+				     old_tss_base, &next_tss_desc);
+	else
+		ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
+				     old_tss_base, &next_tss_desc);
+
+	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
+		ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
+
+	if (reason != TASK_SWITCH_IRET) {
+		next_tss_desc.type |= (1 << 1); /* set busy flag */
+		write_segment_descriptor(ctxt, ops, tss_selector,
+					 &next_tss_desc);
+	}
+
+	ops->set_cr(0,  ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
+	ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
+	ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
+
+	return ret;
+}
+
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+			 struct x86_emulate_ops *ops,
+			 u16 tss_selector, int reason)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc;
+
+	memset(c, 0, sizeof(struct decode_cache));
+	c->eip = ctxt->eip;
+	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+
+	rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason);
+
+	if (rc == X86EMUL_CONTINUE) {
+		memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
+		kvm_rip_write(ctxt->vcpu, c->eip);
+	}
+
+	return rc;
+}
+
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
-- 
cgit v1.2.3-55-g7522


From 2e873022f511b82a5318c7af179f588f08d68cb9 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:18 +0200
Subject: KVM: x86 emulator: Use load_segment_descriptor() instead of
 kvm_load_segment_descriptor()

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index db4776c6b500..702bffffd27f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1508,7 +1508,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
+	rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
 	return rc;
 }
 
@@ -1683,7 +1683,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
 	rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
 	return rc;
 }
 
@@ -2717,7 +2717,7 @@ special_insn:
 		if (c->modrm_reg == VCPU_SREG_SS)
 			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
 
-		rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
+		rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
 
 		c->dst.type = OP_NONE;  /* Disable writeback. */
 		break;
@@ -2892,8 +2892,8 @@ special_insn:
 		goto jmp;
 	case 0xea: /* jmp far */
 	jump_far:
-		if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
-						VCPU_SREG_CS))
+		if (load_segment_descriptor(ctxt, ops, c->src2.val,
+					    VCPU_SREG_CS))
 			goto done;
 
 		c->eip = c->src.val;
-- 
cgit v1.2.3-55-g7522


From ceffb4597253b2420d2f171d8b1cdf2cd3137989 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:19 +0200
Subject: KVM: Use task switch from emulator.c

Remove old task switch code from x86.c

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c |   6 +-
 arch/x86/kvm/x86.c     | 561 ++-----------------------------------------------
 2 files changed, 22 insertions(+), 545 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 702bffffd27f..8225ec26efed 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2291,6 +2291,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
 	ulong old_tss_base =
 		get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+	u32 desc_limit;
 
 	/* FIXME: old_tss_base == ~0 ? */
 
@@ -2311,7 +2312,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 		}
 	}
 
-	if (!next_tss_desc.p || desc_limit_scaled(&next_tss_desc) < 0x67) {
+	desc_limit = desc_limit_scaled(&next_tss_desc);
+	if (!next_tss_desc.p ||
+	    ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
+	     desc_limit < 0x2b)) {
 		kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
 				      tss_selector & 0xfffc);
 		return X86EMUL_PROPAGATE_FAULT;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fbee8fbb33b5..f69854c8f339 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4832,557 +4832,30 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
-				   struct kvm_segment *kvm_desct)
-{
-	kvm_desct->base = get_desc_base(seg_desc);
-	kvm_desct->limit = get_desc_limit(seg_desc);
-	if (seg_desc->g) {
-		kvm_desct->limit <<= 12;
-		kvm_desct->limit |= 0xfff;
-	}
-	kvm_desct->selector = selector;
-	kvm_desct->type = seg_desc->type;
-	kvm_desct->present = seg_desc->p;
-	kvm_desct->dpl = seg_desc->dpl;
-	kvm_desct->db = seg_desc->d;
-	kvm_desct->s = seg_desc->s;
-	kvm_desct->l = seg_desc->l;
-	kvm_desct->g = seg_desc->g;
-	kvm_desct->avl = seg_desc->avl;
-	if (!selector)
-		kvm_desct->unusable = 1;
-	else
-		kvm_desct->unusable = 0;
-	kvm_desct->padding = 0;
-}
-
-static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
-					  u16 selector,
-					  struct desc_ptr *dtable)
-{
-	if (selector & 1 << 2) {
-		struct kvm_segment kvm_seg;
-
-		kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
-
-		if (kvm_seg.unusable)
-			dtable->size = 0;
-		else
-			dtable->size = kvm_seg.limit;
-		dtable->address = kvm_seg.base;
-	}
-	else
-		kvm_x86_ops->get_gdt(vcpu, dtable);
-}
-
-/* allowed just for 8 bytes segments */
-static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
-					 struct desc_struct *seg_desc)
-{
-	struct desc_ptr dtable;
-	u16 index = selector >> 3;
-	int ret;
-	u32 err;
-	gva_t addr;
-
-	get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
-	if (dtable.size < index * 8 + 7) {
-		kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
-	addr = dtable.base + index * 8;
-	ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
-					 vcpu,  &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
-		kvm_inject_page_fault(vcpu, addr, err);
-
-       return ret;
-}
-
-/* allowed just for 8 bytes segments */
-static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
-					 struct desc_struct *seg_desc)
-{
-	struct desc_ptr dtable;
-	u16 index = selector >> 3;
-
-	get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
-	if (dtable.size < index * 8 + 7)
-		return 1;
-	return kvm_write_guest_virt(dtable.address + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
-}
-
-static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
-			       struct desc_struct *seg_desc)
-{
-	u32 base_addr = get_desc_base(seg_desc);
-
-	return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
-}
-
-static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
-			     struct desc_struct *seg_desc)
-{
-	u32 base_addr = get_desc_base(seg_desc);
-
-	return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
-}
-
-static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
-{
-	struct kvm_segment kvm_seg;
-
-	kvm_get_segment(vcpu, &kvm_seg, seg);
-	return kvm_seg.selector;
-}
-
-static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
-{
-	struct kvm_segment segvar = {
-		.base = selector << 4,
-		.limit = 0xffff,
-		.selector = selector,
-		.type = 3,
-		.present = 1,
-		.dpl = 3,
-		.db = 0,
-		.s = 1,
-		.l = 0,
-		.g = 0,
-		.avl = 0,
-		.unusable = 0,
-	};
-	kvm_x86_ops->set_segment(vcpu, &segvar, seg);
-	return X86EMUL_CONTINUE;
-}
-
-static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
-{
-	return (seg != VCPU_SREG_LDTR) &&
-		(seg != VCPU_SREG_TR) &&
-		(kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
-}
-
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
-{
-	struct kvm_segment kvm_seg;
-	struct desc_struct seg_desc;
-	u8 dpl, rpl, cpl;
-	unsigned err_vec = GP_VECTOR;
-	u32 err_code = 0;
-	bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
-	int ret;
-
-	if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
-		return kvm_load_realmode_segment(vcpu, selector, seg);
-
-	/* NULL selector is not valid for TR, CS and SS */
-	if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
-	    && null_selector)
-		goto exception;
-
-	/* TR should be in GDT only */
-	if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
-		goto exception;
-
-	ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
-	if (ret)
-		return ret;
-
-	seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
-
-	if (null_selector) { /* for NULL selector skip all following checks */
-		kvm_seg.unusable = 1;
-		goto load;
-	}
-
-	err_code = selector & 0xfffc;
-	err_vec = GP_VECTOR;
-
-	/* can't load system descriptor into segment selecor */
-	if (seg <= VCPU_SREG_GS && !kvm_seg.s)
-		goto exception;
-
-	if (!kvm_seg.present) {
-		err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
-		goto exception;
-	}
-
-	rpl = selector & 3;
-	dpl = kvm_seg.dpl;
-	cpl = kvm_x86_ops->get_cpl(vcpu);
-
-	switch (seg) {
-	case VCPU_SREG_SS:
-		/*
-		 * segment is not a writable data segment or segment
-		 * selector's RPL != CPL or segment selector's RPL != CPL
-		 */
-		if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
-			goto exception;
-		break;
-	case VCPU_SREG_CS:
-		if (!(kvm_seg.type & 8))
-			goto exception;
-
-		if (kvm_seg.type & 4) {
-			/* conforming */
-			if (dpl > cpl)
-				goto exception;
-		} else {
-			/* nonconforming */
-			if (rpl > cpl || dpl != cpl)
-				goto exception;
-		}
-		/* CS(RPL) <- CPL */
-		selector = (selector & 0xfffc) | cpl;
-            break;
-	case VCPU_SREG_TR:
-		if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
-			goto exception;
-		break;
-	case VCPU_SREG_LDTR:
-		if (kvm_seg.s || kvm_seg.type != 2)
-			goto exception;
-		break;
-	default: /*  DS, ES, FS, or GS */
-		/*
-		 * segment is not a data or readable code segment or
-		 * ((segment is a data or nonconforming code segment)
-		 * and (both RPL and CPL > DPL))
-		 */
-		if ((kvm_seg.type & 0xa) == 0x8 ||
-		    (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
-			goto exception;
-		break;
-	}
-
-	if (!kvm_seg.unusable && kvm_seg.s) {
-		/* mark segment as accessed */
-		kvm_seg.type |= 1;
-		seg_desc.type |= 1;
-		save_guest_segment_descriptor(vcpu, selector, &seg_desc);
-	}
-load:
-	kvm_set_segment(vcpu, &kvm_seg, seg);
-	return X86EMUL_CONTINUE;
-exception:
-	kvm_queue_exception_e(vcpu, err_vec, err_code);
-	return X86EMUL_PROPAGATE_FAULT;
-}
-
-static void save_state_to_tss32(struct kvm_vcpu *vcpu,
-				struct tss_segment_32 *tss)
-{
-	tss->cr3 = vcpu->arch.cr3;
-	tss->eip = kvm_rip_read(vcpu);
-	tss->eflags = kvm_get_rflags(vcpu);
-	tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
-	tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
-	tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
-	tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
-	tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
-	tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
-	tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
-	tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
-	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
-	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
-	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
-	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
-	tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
-	tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
-	tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
-{
-	struct kvm_segment kvm_seg;
-	kvm_get_segment(vcpu, &kvm_seg, seg);
-	kvm_seg.selector = sel;
-	kvm_set_segment(vcpu, &kvm_seg, seg);
-}
-
-static int load_state_from_tss32(struct kvm_vcpu *vcpu,
-				  struct tss_segment_32 *tss)
-{
-	kvm_set_cr3(vcpu, tss->cr3);
-
-	kvm_rip_write(vcpu, tss->eip);
-	kvm_set_rflags(vcpu, tss->eflags | 2);
-
-	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
-	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
-	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
-	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
-	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
-	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
-	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
-	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
-
-	/*
-	 * SDM says that segment selectors are loaded before segment
-	 * descriptors
-	 */
-	kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
-	kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
-	kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
-	kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
-	kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
-	kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
-	kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
-
-	/*
-	 * Now load segment descriptors. If fault happenes at this stage
-	 * it is handled in a context of new task
-	 */
-	if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
-		return 1;
-	return 0;
-}
-
-static void save_state_to_tss16(struct kvm_vcpu *vcpu,
-				struct tss_segment_16 *tss)
-{
-	tss->ip = kvm_rip_read(vcpu);
-	tss->flag = kvm_get_rflags(vcpu);
-	tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
-	tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
-	tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
-	tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
-	tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
-	tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
-	tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
-	tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
-
-	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
-	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
-	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
-	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
-	tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static int load_state_from_tss16(struct kvm_vcpu *vcpu,
-				 struct tss_segment_16 *tss)
-{
-	kvm_rip_write(vcpu, tss->ip);
-	kvm_set_rflags(vcpu, tss->flag | 2);
-	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
-	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
-	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
-	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
-	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
-	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
-	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
-	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
-
-	/*
-	 * SDM says that segment selectors are loaded before segment
-	 * descriptors
-	 */
-	kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
-	kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
-	kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
-	kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
-	kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
-
-	/*
-	 * Now load segment descriptors. If fault happenes at this stage
-	 * it is handled in a context of new task
-	 */
-	if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
-		return 1;
-	return 0;
-}
-
-static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
-			      u16 old_tss_sel, u32 old_tss_base,
-			      struct desc_struct *nseg_desc)
-{
-	struct tss_segment_16 tss_segment_16;
-	int ret = 0;
-
-	if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
-			   sizeof tss_segment_16))
-		goto out;
-
-	save_state_to_tss16(vcpu, &tss_segment_16);
-
-	if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
-			    sizeof tss_segment_16))
-		goto out;
-
-	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
-			   &tss_segment_16, sizeof tss_segment_16))
-		goto out;
-
-	if (old_tss_sel != 0xffff) {
-		tss_segment_16.prev_task_link = old_tss_sel;
-
-		if (kvm_write_guest(vcpu->kvm,
-				    get_tss_base_addr_write(vcpu, nseg_desc),
-				    &tss_segment_16.prev_task_link,
-				    sizeof tss_segment_16.prev_task_link))
-			goto out;
-	}
-
-	if (load_state_from_tss16(vcpu, &tss_segment_16))
-		goto out;
-
-	ret = 1;
-out:
-	return ret;
-}
-
-static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
-		       u16 old_tss_sel, u32 old_tss_base,
-		       struct desc_struct *nseg_desc)
-{
-	struct tss_segment_32 tss_segment_32;
-	int ret = 0;
-
-	if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
-			   sizeof tss_segment_32))
-		goto out;
-
-	save_state_to_tss32(vcpu, &tss_segment_32);
-
-	if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
-			    sizeof tss_segment_32))
-		goto out;
-
-	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
-			   &tss_segment_32, sizeof tss_segment_32))
-		goto out;
-
-	if (old_tss_sel != 0xffff) {
-		tss_segment_32.prev_task_link = old_tss_sel;
-
-		if (kvm_write_guest(vcpu->kvm,
-				    get_tss_base_addr_write(vcpu, nseg_desc),
-				    &tss_segment_32.prev_task_link,
-				    sizeof tss_segment_32.prev_task_link))
-			goto out;
-	}
-
-	if (load_state_from_tss32(vcpu, &tss_segment_32))
-		goto out;
-
-	ret = 1;
-out:
-	return ret;
-}
-
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 {
-	struct kvm_segment tr_seg;
-	struct desc_struct cseg_desc;
-	struct desc_struct nseg_desc;
-	int ret = 0;
-	u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
-	u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
-	u32 desc_limit;
-
-	old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
-
-	/* FIXME: Handle errors. Failure to read either TSS or their
-	 * descriptors should generate a pagefault.
-	 */
-	if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
-		goto out;
-
-	if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
-		goto out;
-
-	if (reason != TASK_SWITCH_IRET) {
-		int cpl;
-
-		cpl = kvm_x86_ops->get_cpl(vcpu);
-		if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
-			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
-			return 1;
-		}
-	}
-
-	desc_limit = get_desc_limit(&nseg_desc);
-	if (!nseg_desc.p ||
-	    ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
-	     desc_limit < 0x2b)) {
-		kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
-		return 1;
-	}
-
-	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
-		cseg_desc.type &= ~(1 << 1); //clear the B flag
-		save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
-	}
-
-	if (reason == TASK_SWITCH_IRET) {
-		u32 eflags = kvm_get_rflags(vcpu);
-		kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
-	}
+	int cs_db, cs_l, ret;
+	cache_all_regs(vcpu);
 
-	/* set back link to prev task only if NT bit is set in eflags
-	   note that old_tss_sel is not used afetr this point */
-	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
-		old_tss_sel = 0xffff;
+	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
-	if (nseg_desc.type & 8)
-		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
-					 old_tss_base, &nseg_desc);
-	else
-		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
-					 old_tss_base, &nseg_desc);
+	vcpu->arch.emulate_ctxt.vcpu = vcpu;
+	vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+	vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
+	vcpu->arch.emulate_ctxt.mode =
+		(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+		(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+		? X86EMUL_MODE_VM86 : cs_l
+		? X86EMUL_MODE_PROT64 :	cs_db
+		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 
-	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
-		u32 eflags = kvm_get_rflags(vcpu);
-		kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
-	}
+	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
+				   tss_selector, reason);
 
-	if (reason != TASK_SWITCH_IRET) {
-		nseg_desc.type |= (1 << 1);
-		save_guest_segment_descriptor(vcpu, tss_selector,
-					      &nseg_desc);
-	}
+	if (ret == X86EMUL_CONTINUE)
+		kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 
-	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
-	seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
-	tr_seg.type = 11;
-	kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
-out:
-	return ret;
+	return (ret != X86EMUL_CONTINUE);
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
 
-- 
cgit v1.2.3-55-g7522


From 69f55cb11e8d789433d111ac3a0f60be37a1ae01 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:20 +0200
Subject: KVM: x86 emulator: populate OP_MEM operand during decoding.

All struct operand fields are initialized during decoding for all
operand types except OP_MEM, but there is no reason for that. Move
OP_MEM operand initialization into decoding stage for consistency.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 66 ++++++++++++++++++++++----------------------------
 1 file changed, 29 insertions(+), 37 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8225ec26efed..0eed6839619f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1057,6 +1057,10 @@ done_prefixes:
 
 	if (c->ad_bytes != 8)
 		c->modrm_ea = (u32)c->modrm_ea;
+
+	if (c->rip_relative)
+		c->modrm_ea += c->eip;
+
 	/*
 	 * Decode and fetch the source operand: register, memory
 	 * or immediate.
@@ -1091,6 +1095,8 @@ done_prefixes:
 			break;
 		}
 		c->src.type = OP_MEM;
+		c->src.ptr = (unsigned long *)c->modrm_ea;
+		c->src.val = 0;
 		break;
 	case SrcImm:
 	case SrcImmU:
@@ -1169,8 +1175,10 @@ done_prefixes:
 		c->src2.val = 1;
 		break;
 	case Src2Mem16:
-		c->src2.bytes = 2;
 		c->src2.type = OP_MEM;
+		c->src2.bytes = 2;
+		c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
+		c->src2.val = 0;
 		break;
 	}
 
@@ -1192,6 +1200,15 @@ done_prefixes:
 			break;
 		}
 		c->dst.type = OP_MEM;
+		c->dst.ptr = (unsigned long *)c->modrm_ea;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.val = 0;
+		if (c->d & BitOp) {
+			unsigned long mask = ~(c->dst.bytes * 8 - 1);
+
+			c->dst.ptr = (void *)c->dst.ptr +
+						   (c->src.val & mask) / 8;
+		}
 		break;
 	case DstAcc:
 		c->dst.type = OP_REG;
@@ -1215,9 +1232,6 @@ done_prefixes:
 		break;
 	}
 
-	if (c->rip_relative)
-		c->modrm_ea += c->eip;
-
 done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
@@ -1638,14 +1652,13 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
 }
 
 static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
-			       struct x86_emulate_ops *ops,
-			       unsigned long memop)
+			       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
 	u64 old, new;
 	int rc;
 
-	rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
+	rc = ops->read_emulated(c->modrm_ea, &old, 8, ctxt->vcpu);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
@@ -1660,7 +1673,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
 		new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
 		       (u32) c->regs[VCPU_REGS_RBX];
 
-		rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
+		rc = ops->cmpxchg_emulated(c->modrm_ea, &old, &new, 8, ctxt->vcpu);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		ctxt->eflags |= EFLG_ZF;
@@ -2382,7 +2395,6 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
-	unsigned long memop = 0;
 	u64 msr_data;
 	unsigned long saved_eip = 0;
 	struct decode_cache *c = &ctxt->decode;
@@ -2417,9 +2429,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		goto done;
 	}
 
-	if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
-		memop = c->modrm_ea;
-
 	if (c->rep_prefix && (c->d & String)) {
 		/* All REP prefixes have the same first termination condition */
 		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
@@ -2451,8 +2460,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	if (c->src.type == OP_MEM) {
-		c->src.ptr = (unsigned long *)memop;
-		c->src.val = 0;
 		rc = ops->read_emulated((unsigned long)c->src.ptr,
 					&c->src.val,
 					c->src.bytes,
@@ -2463,8 +2470,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	if (c->src2.type == OP_MEM) {
-		c->src2.ptr = (unsigned long *)(memop + c->src.bytes);
-		c->src2.val = 0;
 		rc = ops->read_emulated((unsigned long)c->src2.ptr,
 					&c->src2.val,
 					c->src2.bytes,
@@ -2477,25 +2482,12 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		goto special_insn;
 
 
-	if (c->dst.type == OP_MEM) {
-		c->dst.ptr = (unsigned long *)memop;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.val = 0;
-		if (c->d & BitOp) {
-			unsigned long mask = ~(c->dst.bytes * 8 - 1);
-
-			c->dst.ptr = (void *)c->dst.ptr +
-						   (c->src.val & mask) / 8;
-		}
-		if (!(c->d & Mov)) {
-			/* optimisation - avoid slow emulated read */
-			rc = ops->read_emulated((unsigned long)c->dst.ptr,
-						&c->dst.val,
-						c->dst.bytes,
-						ctxt->vcpu);
-			if (rc != X86EMUL_CONTINUE)
-				goto done;
-		}
+	if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
+		/* optimisation - avoid slow emulated read if Mov */
+		rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
+					c->dst.bytes, ctxt->vcpu);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
 	}
 	c->dst.orig_val = c->dst.val;
 
@@ -3062,7 +3054,7 @@ twobyte_insn:
 			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
 			goto done;
 		case 7: /* invlpg*/
-			emulate_invlpg(ctxt->vcpu, memop);
+			emulate_invlpg(ctxt->vcpu, c->modrm_ea);
 			/* Disable writeback. */
 			c->dst.type = OP_NONE;
 			break;
@@ -3263,7 +3255,7 @@ twobyte_insn:
 							(u64) c->src.val;
 		break;
 	case 0xc7:		/* Grp9 (cmpxchg8b) */
-		rc = emulate_grp9(ctxt, ops, memop);
+		rc = emulate_grp9(ctxt, ops);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		c->dst.type = OP_NONE;
-- 
cgit v1.2.3-55-g7522


From a682e35449abc83d260a8219015c7cb4b25ecced Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:21 +0200
Subject: KVM: x86 emulator: add decoding of X,Y parameters from Intel SDM

Add decoding of X,Y parameters from Intel SDM which are used by string
instruction to specify source and destination. Use this new decoding
to implement movs, cmps, stos, lods in a generic way.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 125 +++++++++++++++++--------------------------------
 1 file changed, 44 insertions(+), 81 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0eed6839619f..3b32270a20db 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -51,6 +51,7 @@
 #define DstReg      (2<<1)	/* Register operand. */
 #define DstMem      (3<<1)	/* Memory operand. */
 #define DstAcc      (4<<1)      /* Destination Accumulator */
+#define DstDI       (5<<1)	/* Destination is in ES:(E)DI */
 #define DstMask     (7<<1)
 /* Source operand type. */
 #define SrcNone     (0<<4)	/* No source operand. */
@@ -64,6 +65,7 @@
 #define SrcOne      (7<<4)	/* Implied '1' */
 #define SrcImmUByte (8<<4)      /* 8-bit unsigned immediate operand. */
 #define SrcImmU     (9<<4)      /* Immediate operand, unsigned */
+#define SrcSI       (0xa<<4)	/* Source is in the DS:RSI */
 #define SrcMask     (0xf<<4)
 /* Generic ModRM decode. */
 #define ModRM       (1<<8)
@@ -177,12 +179,12 @@ static u32 opcode_table[256] = {
 	/* 0xA0 - 0xA7 */
 	ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
 	ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
-	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
-	ByteOp | ImplicitOps | String, ImplicitOps | String,
+	ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
+	ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
 	/* 0xA8 - 0xAF */
-	0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
-	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
-	ByteOp | ImplicitOps | String, ImplicitOps | String,
+	0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
+	ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
+	ByteOp | DstDI | String, DstDI | String,
 	/* 0xB0 - 0xB7 */
 	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
 	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
@@ -1145,6 +1147,14 @@ done_prefixes:
 		c->src.bytes = 1;
 		c->src.val = 1;
 		break;
+	case SrcSI:
+		c->src.type = OP_MEM;
+		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->src.ptr = (unsigned long *)
+			register_address(c,  seg_override_base(ctxt, c),
+					 c->regs[VCPU_REGS_RSI]);
+		c->src.val = 0;
+		break;
 	}
 
 	/*
@@ -1230,6 +1240,14 @@ done_prefixes:
 		}
 		c->dst.orig_val = c->dst.val;
 		break;
+	case DstDI:
+		c->dst.type = OP_MEM;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.ptr = (unsigned long *)
+			register_address(c, es_base(ctxt),
+					 c->regs[VCPU_REGS_RDI]);
+		c->dst.val = 0;
+		break;
 	}
 
 done:
@@ -2392,6 +2410,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
+			    int reg, unsigned long **ptr)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
+
+	register_address_increment(c, &c->regs[reg], df * c->src.bytes);
+	*ptr = (unsigned long *)register_address(c,  base, c->regs[reg]);
+}
+
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
@@ -2754,89 +2782,16 @@ special_insn:
 		c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
 		break;
 	case 0xa4 ... 0xa5:	/* movs */
-		c->dst.type = OP_MEM;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)register_address(c,
-						   es_base(ctxt),
-						   c->regs[VCPU_REGS_RDI]);
-		rc = ops->read_emulated(register_address(c,
-						seg_override_base(ctxt, c),
-						c->regs[VCPU_REGS_RSI]),
-					&c->dst.val,
-					c->dst.bytes, ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-		register_address_increment(c, &c->regs[VCPU_REGS_RSI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-							   : c->dst.bytes);
-		register_address_increment(c, &c->regs[VCPU_REGS_RDI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-							   : c->dst.bytes);
-		break;
+		goto mov;
 	case 0xa6 ... 0xa7:	/* cmps */
-		c->src.type = OP_NONE; /* Disable writeback. */
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.ptr = (unsigned long *)register_address(c,
-				       seg_override_base(ctxt, c),
-						   c->regs[VCPU_REGS_RSI]);
-		rc = ops->read_emulated((unsigned long)c->src.ptr,
-					&c->src.val,
-					c->src.bytes,
-					ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-
 		c->dst.type = OP_NONE; /* Disable writeback. */
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)register_address(c,
-						   es_base(ctxt),
-						   c->regs[VCPU_REGS_RDI]);
-		rc = ops->read_emulated((unsigned long)c->dst.ptr,
-					&c->dst.val,
-					c->dst.bytes,
-					ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-
 		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
-
-		emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
-
-		register_address_increment(c, &c->regs[VCPU_REGS_RSI],
-				       (ctxt->eflags & EFLG_DF) ? -c->src.bytes
-								  : c->src.bytes);
-		register_address_increment(c, &c->regs[VCPU_REGS_RDI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-								  : c->dst.bytes);
-
-		break;
+		goto cmp;
 	case 0xaa ... 0xab:	/* stos */
-		c->dst.type = OP_MEM;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)register_address(c,
-						   es_base(ctxt),
-						   c->regs[VCPU_REGS_RDI]);
 		c->dst.val = c->regs[VCPU_REGS_RAX];
-		register_address_increment(c, &c->regs[VCPU_REGS_RDI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-							   : c->dst.bytes);
 		break;
 	case 0xac ... 0xad:	/* lods */
-		c->dst.type = OP_REG;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
-		rc = ops->read_emulated(register_address(c,
-						seg_override_base(ctxt, c),
-						c->regs[VCPU_REGS_RSI]),
-					&c->dst.val,
-					c->dst.bytes,
-					ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-		register_address_increment(c, &c->regs[VCPU_REGS_RSI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-							   : c->dst.bytes);
-		break;
+		goto mov;
 	case 0xae ... 0xaf:	/* scas */
 		DPRINTF("Urk! I don't handle SCAS.\n");
 		goto cannot_emulate;
@@ -2979,6 +2934,14 @@ writeback:
 	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
+	if ((c->d & SrcMask) == SrcSI)
+		string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
+				&c->src.ptr);
+
+	if ((c->d & DstMask) == DstDI)
+		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI,
+				&c->dst.ptr);
+
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(ctxt->vcpu, c->eip);
-- 
cgit v1.2.3-55-g7522


From d9271123a46011af26da680baeb7fdf67b498abf Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:22 +0200
Subject: KVM: x86 emulator: during rep emulation decrement ECX only if
 emulation succeeded

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 3b32270a20db..594574d8b9e9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2411,13 +2411,13 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 }
 
 static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
-			    int reg, unsigned long **ptr)
+			    int reg, struct operand *op)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
 
-	register_address_increment(c, &c->regs[reg], df * c->src.bytes);
-	*ptr = (unsigned long *)register_address(c,  base, c->regs[reg]);
+	register_address_increment(c, &c->regs[reg], df * op->bytes);
+	op->ptr = (unsigned long *)register_address(c,  base, c->regs[reg]);
 }
 
 int
@@ -2483,7 +2483,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 				goto done;
 			}
 		}
-		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
 		c->eip = ctxt->eip;
 	}
 
@@ -2936,11 +2935,13 @@ writeback:
 
 	if ((c->d & SrcMask) == SrcSI)
 		string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
-				&c->src.ptr);
+				&c->src);
 
 	if ((c->d & DstMask) == DstDI)
-		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI,
-				&c->dst.ptr);
+		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
+
+	if (c->rep_prefix && (c->d & String))
+		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
 
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
-- 
cgit v1.2.3-55-g7522


From cf8f70bfe38b326bb80b10f76d6544f571040229 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:23 +0200
Subject: KVM: x86 emulator: fix in/out emulation.

in/out emulation is broken now. The breakage is different depending
on where IO device resides. If it is in userspace emulator reports
emulation failure since it incorrectly interprets kvm_emulate_pio()
return value. If IO device is in the kernel emulation of 'in' will do
nothing since kvm_emulate_pio() stores result directly into vcpu
registers, so emulator will overwrite result of emulation during
commit of shadowed register.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |   7 ++
 arch/x86/include/asm/kvm_host.h    |   3 +-
 arch/x86/kvm/emulate.c             |  50 +++++----
 arch/x86/kvm/svm.c                 |  20 ++--
 arch/x86/kvm/vmx.c                 |  18 ++--
 arch/x86/kvm/x86.c                 | 213 +++++++++++++++++++++++--------------
 6 files changed, 178 insertions(+), 133 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index bd469296f5e5..679245c9a55f 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -119,6 +119,13 @@ struct x86_emulate_ops {
 				const void *new,
 				unsigned int bytes,
 				struct kvm_vcpu *vcpu);
+
+	int (*pio_in_emulated)(int size, unsigned short port, void *val,
+			       unsigned int count, struct kvm_vcpu *vcpu);
+
+	int (*pio_out_emulated)(int size, unsigned short port, const void *val,
+				unsigned int count, struct kvm_vcpu *vcpu);
+
 	bool (*get_cached_descriptor)(struct desc_struct *desc,
 				      int seg, struct kvm_vcpu *vcpu);
 	void (*set_cached_descriptor)(struct desc_struct *desc,
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b99cec1547c6..776d3e202b56 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -590,8 +590,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 
 struct x86_emulate_ctxt;
 
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
-		     int size, unsigned port);
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
 			   int size, unsigned long count, int down,
 			    gva_t address, int rep, unsigned port);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 594574d8b9e9..2d095ce9dc87 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -210,13 +210,13 @@ static u32 opcode_table[256] = {
 	0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xE0 - 0xE7 */
 	0, 0, 0, 0,
-	ByteOp | SrcImmUByte, SrcImmUByte,
-	ByteOp | SrcImmUByte, SrcImmUByte,
+	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
+	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
 	/* 0xE8 - 0xEF */
 	SrcImm | Stack, SrcImm | ImplicitOps,
 	SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
-	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
-	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
+	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
 	/* 0xF0 - 0xF7 */
 	0, 0, 0, 0,
 	ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
@@ -2426,8 +2426,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	u64 msr_data;
 	unsigned long saved_eip = 0;
 	struct decode_cache *c = &ctxt->decode;
-	unsigned int port;
-	int io_dir_in;
 	int rc = X86EMUL_CONTINUE;
 
 	ctxt->interruptibility = 0;
@@ -2823,14 +2821,10 @@ special_insn:
 		break;
 	case 0xe4: 	/* inb */
 	case 0xe5: 	/* in */
-		port = c->src.val;
-		io_dir_in = 1;
-		goto do_io;
+		goto do_io_in;
 	case 0xe6: /* outb */
 	case 0xe7: /* out */
-		port = c->src.val;
-		io_dir_in = 0;
-		goto do_io;
+		goto do_io_out;
 	case 0xe8: /* call (near) */ {
 		long int rel = c->src.val;
 		c->src.val = (unsigned long) c->eip;
@@ -2855,25 +2849,29 @@ special_insn:
 		break;
 	case 0xec: /* in al,dx */
 	case 0xed: /* in (e/r)ax,dx */
-		port = c->regs[VCPU_REGS_RDX];
-		io_dir_in = 1;
-		goto do_io;
+		c->src.val = c->regs[VCPU_REGS_RDX];
+	do_io_in:
+		c->dst.bytes = min(c->dst.bytes, 4u);
+		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
+			kvm_inject_gp(ctxt->vcpu, 0);
+			goto done;
+		}
+		if (!ops->pio_in_emulated(c->dst.bytes, c->src.val,
+					  &c->dst.val, 1, ctxt->vcpu))
+			goto done; /* IO is needed */
+		break;
 	case 0xee: /* out al,dx */
 	case 0xef: /* out (e/r)ax,dx */
-		port = c->regs[VCPU_REGS_RDX];
-		io_dir_in = 0;
-	do_io:
-		if (!emulator_io_permited(ctxt, ops, port,
-					  (c->d & ByteOp) ? 1 : c->op_bytes)) {
+		c->src.val = c->regs[VCPU_REGS_RDX];
+	do_io_out:
+		c->dst.bytes = min(c->dst.bytes, 4u);
+		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
-		if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
-				   (c->d & ByteOp) ? 1 : c->op_bytes,
-				   port) != 0) {
-			c->eip = saved_eip;
-			goto cannot_emulate;
-		}
+		ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
+				      ctxt->vcpu);
+		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xf4:              /* hlt */
 		ctxt->vcpu->arch.halt_request = 1;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index abbc3f9d03b2..e9f79619e185 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1494,29 +1494,23 @@ static int shutdown_interception(struct vcpu_svm *svm)
 
 static int io_interception(struct vcpu_svm *svm)
 {
+	struct kvm_vcpu *vcpu = &svm->vcpu;
 	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
 	int size, in, string;
 	unsigned port;
 
 	++svm->vcpu.stat.io_exits;
-
-	svm->next_rip = svm->vmcb->control.exit_info_2;
-
 	string = (io_info & SVM_IOIO_STR_MASK) != 0;
-
-	if (string) {
-		if (emulate_instruction(&svm->vcpu,
-					0, 0, 0) == EMULATE_DO_MMIO)
-			return 0;
-		return 1;
-	}
-
 	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+	if (string || in)
+		return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+
 	port = io_info >> 16;
 	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
-
+	svm->next_rip = svm->vmcb->control.exit_info_2;
 	skip_emulated_instruction(&svm->vcpu);
-	return kvm_emulate_pio(&svm->vcpu, in, size, port);
+
+	return kvm_fast_pio_out(vcpu, size, port);
 }
 
 static int nmi_interception(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 87b3c6843aac..1cceca1c59be 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2985,22 +2985,20 @@ static int handle_io(struct kvm_vcpu *vcpu)
 	int size, in, string;
 	unsigned port;
 
-	++vcpu->stat.io_exits;
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	string = (exit_qualification & 16) != 0;
+	in = (exit_qualification & 8) != 0;
 
-	if (string) {
-		if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
-			return 0;
-		return 1;
-	}
+	++vcpu->stat.io_exits;
 
-	size = (exit_qualification & 7) + 1;
-	in = (exit_qualification & 8) != 0;
-	port = exit_qualification >> 16;
+	if (string || in)
+		return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
 
+	port = exit_qualification >> 16;
+	size = (exit_qualification & 7) + 1;
 	skip_emulated_instruction(vcpu);
-	return kvm_emulate_pio(vcpu, in, size, port);
+
+	return kvm_fast_pio_out(vcpu, size, port);
 }
 
 static void
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f69854c8f339..6624ad13ee99 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3404,6 +3404,86 @@ emul_write:
 	return emulator_write_emulated(addr, new, bytes, vcpu);
 }
 
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
+{
+	/* TODO: String I/O for in kernel device */
+	int r;
+
+	if (vcpu->arch.pio.in)
+		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
+				    vcpu->arch.pio.size, pd);
+	else
+		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+				     vcpu->arch.pio.port, vcpu->arch.pio.size,
+				     pd);
+	return r;
+}
+
+
+static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
+			     unsigned int count, struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.pio.cur_count)
+		goto data_avail;
+
+	trace_kvm_pio(1, port, size, 1);
+
+	vcpu->arch.pio.port = port;
+	vcpu->arch.pio.in = 1;
+	vcpu->arch.pio.string = 0;
+	vcpu->arch.pio.down = 0;
+	vcpu->arch.pio.rep = 0;
+	vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+	vcpu->arch.pio.size = size;
+
+	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+	data_avail:
+		memcpy(val, vcpu->arch.pio_data, size * count);
+		vcpu->arch.pio.cur_count = 0;
+		return 1;
+	}
+
+	vcpu->run->exit_reason = KVM_EXIT_IO;
+	vcpu->run->io.direction = KVM_EXIT_IO_IN;
+	vcpu->run->io.size = size;
+	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+	vcpu->run->io.count = count;
+	vcpu->run->io.port = port;
+
+	return 0;
+}
+
+static int emulator_pio_out_emulated(int size, unsigned short port,
+			      const void *val, unsigned int count,
+			      struct kvm_vcpu *vcpu)
+{
+	trace_kvm_pio(0, port, size, 1);
+
+	vcpu->arch.pio.port = port;
+	vcpu->arch.pio.in = 0;
+	vcpu->arch.pio.string = 0;
+	vcpu->arch.pio.down = 0;
+	vcpu->arch.pio.rep = 0;
+	vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+	vcpu->arch.pio.size = size;
+
+	memcpy(vcpu->arch.pio_data, val, size * count);
+
+	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+		vcpu->arch.pio.cur_count = 0;
+		return 1;
+	}
+
+	vcpu->run->exit_reason = KVM_EXIT_IO;
+	vcpu->run->io.direction = KVM_EXIT_IO_OUT;
+	vcpu->run->io.size = size;
+	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+	vcpu->run->io.count = count;
+	vcpu->run->io.port = port;
+
+	return 0;
+}
+
 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
 	return kvm_x86_ops->get_segment_base(vcpu, seg);
@@ -3597,6 +3677,8 @@ static struct x86_emulate_ops emulate_ops = {
 	.read_emulated       = emulator_read_emulated,
 	.write_emulated      = emulator_write_emulated,
 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
+	.pio_in_emulated     = emulator_pio_in_emulated,
+	.pio_out_emulated    = emulator_pio_out_emulated,
 	.get_cached_descriptor = emulator_get_cached_descriptor,
 	.set_cached_descriptor = emulator_set_cached_descriptor,
 	.get_segment_selector = emulator_get_segment_selector,
@@ -3704,6 +3786,12 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	if (vcpu->arch.pio.string)
 		return EMULATE_DO_MMIO;
 
+	if (vcpu->arch.pio.cur_count && !vcpu->arch.pio.string) {
+		if (!vcpu->arch.pio.in)
+			vcpu->arch.pio.cur_count = 0;
+		return EMULATE_DO_MMIO;
+	}
+
 	if (r || vcpu->mmio_is_write) {
 		run->exit_reason = KVM_EXIT_MMIO;
 		run->mmio.phys_addr = vcpu->mmio_phys_addr;
@@ -3760,43 +3848,36 @@ int complete_pio(struct kvm_vcpu *vcpu)
 	int r;
 	unsigned long val;
 
-	if (!io->string) {
-		if (io->in) {
-			val = kvm_register_read(vcpu, VCPU_REGS_RAX);
-			memcpy(&val, vcpu->arch.pio_data, io->size);
-			kvm_register_write(vcpu, VCPU_REGS_RAX, val);
-		}
-	} else {
-		if (io->in) {
-			r = pio_copy_data(vcpu);
-			if (r)
-				goto out;
-		}
+	if (io->in) {
+		r = pio_copy_data(vcpu);
+		if (r)
+			goto out;
+	}
 
-		delta = 1;
-		if (io->rep) {
-			delta *= io->cur_count;
-			/*
-			 * The size of the register should really depend on
-			 * current address size.
-			 */
-			val = kvm_register_read(vcpu, VCPU_REGS_RCX);
-			val -= delta;
-			kvm_register_write(vcpu, VCPU_REGS_RCX, val);
-		}
-		if (io->down)
-			delta = -delta;
-		delta *= io->size;
-		if (io->in) {
-			val = kvm_register_read(vcpu, VCPU_REGS_RDI);
-			val += delta;
-			kvm_register_write(vcpu, VCPU_REGS_RDI, val);
-		} else {
-			val = kvm_register_read(vcpu, VCPU_REGS_RSI);
-			val += delta;
-			kvm_register_write(vcpu, VCPU_REGS_RSI, val);
-		}
+	delta = 1;
+	if (io->rep) {
+		delta *= io->cur_count;
+		/*
+		 * The size of the register should really depend on
+		 * current address size.
+		 */
+		val = kvm_register_read(vcpu, VCPU_REGS_RCX);
+		val -= delta;
+		kvm_register_write(vcpu, VCPU_REGS_RCX, val);
+	}
+	if (io->down)
+		delta = -delta;
+	delta *= io->size;
+	if (io->in) {
+		val = kvm_register_read(vcpu, VCPU_REGS_RDI);
+		val += delta;
+		kvm_register_write(vcpu, VCPU_REGS_RDI, val);
+	} else {
+		val = kvm_register_read(vcpu, VCPU_REGS_RSI);
+		val += delta;
+		kvm_register_write(vcpu, VCPU_REGS_RSI, val);
 	}
+
 out:
 	io->count -= io->cur_count;
 	io->cur_count = 0;
@@ -3804,21 +3885,6 @@ out:
 	return 0;
 }
 
-static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
-{
-	/* TODO: String I/O for in kernel device */
-	int r;
-
-	if (vcpu->arch.pio.in)
-		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
-				    vcpu->arch.pio.size, pd);
-	else
-		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
-				     vcpu->arch.pio.port, vcpu->arch.pio.size,
-				     pd);
-	return r;
-}
-
 static int pio_string_write(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pio_request *io = &vcpu->arch.pio;
@@ -3836,36 +3902,6 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
 	return r;
 }
 
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
-{
-	unsigned long val;
-
-	trace_kvm_pio(!in, port, size, 1);
-
-	vcpu->run->exit_reason = KVM_EXIT_IO;
-	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-	vcpu->run->io.size = vcpu->arch.pio.size = size;
-	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-	vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
-	vcpu->run->io.port = vcpu->arch.pio.port = port;
-	vcpu->arch.pio.in = in;
-	vcpu->arch.pio.string = 0;
-	vcpu->arch.pio.down = 0;
-	vcpu->arch.pio.rep = 0;
-
-	if (!vcpu->arch.pio.in) {
-		val = kvm_register_read(vcpu, VCPU_REGS_RAX);
-		memcpy(vcpu->arch.pio_data, &val, 4);
-	}
-
-	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
-		complete_pio(vcpu);
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio);
-
 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
 		  int size, unsigned long count, int down,
 		  gva_t address, int rep, unsigned port)
@@ -3931,6 +3967,16 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
 
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
+{
+	unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
+	/* do not return to emulator after return from userspace */
+	vcpu->arch.pio.cur_count = 0;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
+
 static void bounce_off(void *info)
 {
 	/* nothing */
@@ -4661,9 +4707,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	if (vcpu->arch.pio.cur_count) {
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-		r = complete_pio(vcpu);
+		if (!vcpu->arch.pio.string)
+			r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
+		else
+			r = complete_pio(vcpu);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-		if (r)
+		if (r == EMULATE_DO_MMIO)
 			goto out;
 	}
 	if (vcpu->mmio_needed) {
-- 
cgit v1.2.3-55-g7522


From 7972995b0c346de76fe260ce0fd6bcc8ffab724a Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:24 +0200
Subject: KVM: x86 emulator: Move string pio emulation into emulator.c

Currently emulation is done outside of emulator so things like doing
ins/outs to/from mmio are broken it also makes it hard (if not impossible)
to implement single stepping in the future. The implementation in this
patch is not efficient since it exits to userspace for each IO while
previous implementation did 'ins' in batches. Further patch that
implements pio in string read ahead address this problem.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   8 --
 arch/x86/kvm/emulate.c          |  48 +++-------
 arch/x86/kvm/x86.c              | 206 ++++------------------------------------
 3 files changed, 32 insertions(+), 230 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 776d3e202b56..26c629a062db 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -224,14 +224,9 @@ struct kvm_pv_mmu_op_buffer {
 
 struct kvm_pio_request {
 	unsigned long count;
-	int cur_count;
-	gva_t guest_gva;
 	int in;
 	int port;
 	int size;
-	int string;
-	int down;
-	int rep;
 };
 
 /*
@@ -591,9 +586,6 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 struct x86_emulate_ctxt;
 
 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
-			   int size, unsigned long count, int down,
-			    gva_t address, int rep, unsigned port);
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2d095ce9dc87..2c66e097d916 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -153,8 +153,8 @@ static u32 opcode_table[256] = {
 	0, 0, 0, 0,
 	/* 0x68 - 0x6F */
 	SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
-	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* insb, insw/insd */
-	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* outsb, outsw/outsd */
+	DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
+	SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
 	/* 0x70 - 0x77 */
 	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
 	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
@@ -2615,47 +2615,29 @@ special_insn:
 		break;
 	case 0x6c:		/* insb */
 	case 0x6d:		/* insw/insd */
+		c->dst.bytes = min(c->dst.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
-					  (c->d & ByteOp) ? 1 : c->op_bytes)) {
+					  c->dst.bytes)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
-		if (kvm_emulate_pio_string(ctxt->vcpu,
-				1,
-				(c->d & ByteOp) ? 1 : c->op_bytes,
-				c->rep_prefix ?
-				address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
-				(ctxt->eflags & EFLG_DF),
-				register_address(c, es_base(ctxt),
-						 c->regs[VCPU_REGS_RDI]),
-				c->rep_prefix,
-				c->regs[VCPU_REGS_RDX]) == 0) {
-			c->eip = saved_eip;
-			return -1;
-		}
-		return 0;
+		if (!ops->pio_in_emulated(c->dst.bytes, c->regs[VCPU_REGS_RDX],
+					  &c->dst.val, 1, ctxt->vcpu))
+			goto done; /* IO is needed, skip writeback */
+		break;
 	case 0x6e:		/* outsb */
 	case 0x6f:		/* outsw/outsd */
+		c->src.bytes = min(c->src.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
-					  (c->d & ByteOp) ? 1 : c->op_bytes)) {
+					  c->src.bytes)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
-		if (kvm_emulate_pio_string(ctxt->vcpu,
-				0,
-				(c->d & ByteOp) ? 1 : c->op_bytes,
-				c->rep_prefix ?
-				address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
-				(ctxt->eflags & EFLG_DF),
-					 register_address(c,
-					  seg_override_base(ctxt, c),
-						 c->regs[VCPU_REGS_RSI]),
-				c->rep_prefix,
-				c->regs[VCPU_REGS_RDX]) == 0) {
-			c->eip = saved_eip;
-			return -1;
-		}
-		return 0;
+		ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
+				      &c->src.val, 1, ctxt->vcpu);
+
+		c->dst.type = OP_NONE; /* nothing to writeback */
+		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(c->b, ctxt->eflags))
 			jmp_rel(c, c->src.val);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6624ad13ee99..658e8e8155cb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3150,18 +3150,17 @@ static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
 }
 
-static int kvm_write_guest_virt_helper(gva_t addr, void *val,
+static int kvm_write_guest_virt_system(gva_t addr, void *val,
 				       unsigned int bytes,
-				       struct kvm_vcpu *vcpu, u32 access,
+				       struct kvm_vcpu *vcpu,
 				       u32 *error)
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
 
-	access |= PFERR_WRITE_MASK;
-
 	while (bytes) {
-		gpa_t gpa =  vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
+		gpa_t gpa =  vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
+						       PFERR_WRITE_MASK, error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
@@ -3184,20 +3183,6 @@ out:
 	return r;
 }
 
-static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
-				struct kvm_vcpu *vcpu, u32 *error)
-{
-	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
-	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, access, error);
-}
-
-static int kvm_write_guest_virt_system(gva_t addr, void *val,
-				       unsigned int bytes,
-				       struct kvm_vcpu *vcpu, u32 *error)
-{
-	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
-}
-
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
 				  unsigned int bytes,
@@ -3423,23 +3408,20 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
 			     unsigned int count, struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.pio.cur_count)
+	if (vcpu->arch.pio.count)
 		goto data_avail;
 
 	trace_kvm_pio(1, port, size, 1);
 
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = 1;
-	vcpu->arch.pio.string = 0;
-	vcpu->arch.pio.down = 0;
-	vcpu->arch.pio.rep = 0;
-	vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+	vcpu->arch.pio.count  = count;
 	vcpu->arch.pio.size = size;
 
 	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
 	data_avail:
 		memcpy(val, vcpu->arch.pio_data, size * count);
-		vcpu->arch.pio.cur_count = 0;
+		vcpu->arch.pio.count = 0;
 		return 1;
 	}
 
@@ -3461,16 +3443,13 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
 
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = 0;
-	vcpu->arch.pio.string = 0;
-	vcpu->arch.pio.down = 0;
-	vcpu->arch.pio.rep = 0;
-	vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+	vcpu->arch.pio.count = count;
 	vcpu->arch.pio.size = size;
 
 	memcpy(vcpu->arch.pio_data, val, size * count);
 
 	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
-		vcpu->arch.pio.cur_count = 0;
+		vcpu->arch.pio.count = 0;
 		return 1;
 	}
 
@@ -3717,7 +3696,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	cache_all_regs(vcpu);
 
 	vcpu->mmio_is_write = 0;
-	vcpu->arch.pio.string = 0;
 
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 		int cs_db, cs_l;
@@ -3783,12 +3761,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	if (r == 0)
 		kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
 
-	if (vcpu->arch.pio.string)
-		return EMULATE_DO_MMIO;
-
-	if (vcpu->arch.pio.cur_count && !vcpu->arch.pio.string) {
+	if (vcpu->arch.pio.count) {
 		if (!vcpu->arch.pio.in)
-			vcpu->arch.pio.cur_count = 0;
+			vcpu->arch.pio.count = 0;
 		return EMULATE_DO_MMIO;
 	}
 
@@ -3821,158 +3796,12 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 }
 EXPORT_SYMBOL_GPL(emulate_instruction);
 
-static int pio_copy_data(struct kvm_vcpu *vcpu)
-{
-	void *p = vcpu->arch.pio_data;
-	gva_t q = vcpu->arch.pio.guest_gva;
-	unsigned bytes;
-	int ret;
-	u32 error_code;
-
-	bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
-	if (vcpu->arch.pio.in)
-		ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
-	else
-		ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
-
-	if (ret == X86EMUL_PROPAGATE_FAULT)
-		kvm_inject_page_fault(vcpu, q, error_code);
-
-	return ret;
-}
-
-int complete_pio(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pio_request *io = &vcpu->arch.pio;
-	long delta;
-	int r;
-	unsigned long val;
-
-	if (io->in) {
-		r = pio_copy_data(vcpu);
-		if (r)
-			goto out;
-	}
-
-	delta = 1;
-	if (io->rep) {
-		delta *= io->cur_count;
-		/*
-		 * The size of the register should really depend on
-		 * current address size.
-		 */
-		val = kvm_register_read(vcpu, VCPU_REGS_RCX);
-		val -= delta;
-		kvm_register_write(vcpu, VCPU_REGS_RCX, val);
-	}
-	if (io->down)
-		delta = -delta;
-	delta *= io->size;
-	if (io->in) {
-		val = kvm_register_read(vcpu, VCPU_REGS_RDI);
-		val += delta;
-		kvm_register_write(vcpu, VCPU_REGS_RDI, val);
-	} else {
-		val = kvm_register_read(vcpu, VCPU_REGS_RSI);
-		val += delta;
-		kvm_register_write(vcpu, VCPU_REGS_RSI, val);
-	}
-
-out:
-	io->count -= io->cur_count;
-	io->cur_count = 0;
-
-	return 0;
-}
-
-static int pio_string_write(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pio_request *io = &vcpu->arch.pio;
-	void *pd = vcpu->arch.pio_data;
-	int i, r = 0;
-
-	for (i = 0; i < io->cur_count; i++) {
-		if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
-				     io->port, io->size, pd)) {
-			r = -EOPNOTSUPP;
-			break;
-		}
-		pd += io->size;
-	}
-	return r;
-}
-
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
-		  int size, unsigned long count, int down,
-		  gva_t address, int rep, unsigned port)
-{
-	unsigned now, in_page;
-	int ret = 0;
-
-	trace_kvm_pio(!in, port, size, count);
-
-	vcpu->run->exit_reason = KVM_EXIT_IO;
-	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-	vcpu->run->io.size = vcpu->arch.pio.size = size;
-	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-	vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
-	vcpu->run->io.port = vcpu->arch.pio.port = port;
-	vcpu->arch.pio.in = in;
-	vcpu->arch.pio.string = 1;
-	vcpu->arch.pio.down = down;
-	vcpu->arch.pio.rep = rep;
-
-	if (!count) {
-		kvm_x86_ops->skip_emulated_instruction(vcpu);
-		return 1;
-	}
-
-	if (!down)
-		in_page = PAGE_SIZE - offset_in_page(address);
-	else
-		in_page = offset_in_page(address) + size;
-	now = min(count, (unsigned long)in_page / size);
-	if (!now)
-		now = 1;
-	if (down) {
-		/*
-		 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
-		 */
-		pr_unimpl(vcpu, "guest string pio down\n");
-		kvm_inject_gp(vcpu, 0);
-		return 1;
-	}
-	vcpu->run->io.count = now;
-	vcpu->arch.pio.cur_count = now;
-
-	if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
-		kvm_x86_ops->skip_emulated_instruction(vcpu);
-
-	vcpu->arch.pio.guest_gva = address;
-
-	if (!vcpu->arch.pio.in) {
-		/* string PIO write */
-		ret = pio_copy_data(vcpu);
-		if (ret == X86EMUL_PROPAGATE_FAULT)
-			return 1;
-		if (ret == 0 && !pio_string_write(vcpu)) {
-			complete_pio(vcpu);
-			if (vcpu->arch.pio.count == 0)
-				ret = 1;
-		}
-	}
-	/* no string PIO read support yet */
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
-
 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
 {
 	unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
 	/* do not return to emulator after return from userspace */
-	vcpu->arch.pio.cur_count = 0;
+	vcpu->arch.pio.count = 0;
 	return ret;
 }
 EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
@@ -4705,15 +4534,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (!irqchip_in_kernel(vcpu->kvm))
 		kvm_set_cr8(vcpu, kvm_run->cr8);
 
-	if (vcpu->arch.pio.cur_count) {
+	if (vcpu->arch.pio.count) {
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-		if (!vcpu->arch.pio.string)
-			r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
-		else
-			r = complete_pio(vcpu);
+		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-		if (r == EMULATE_DO_MMIO)
+		if (r == EMULATE_DO_MMIO) {
+			r = 0;
 			goto out;
+		}
 	}
 	if (vcpu->mmio_needed) {
 		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
-- 
cgit v1.2.3-55-g7522


From cb404fe0898779ec5fe5e06e90aaddcf40aefad8 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:25 +0200
Subject: KVM: x86 emulator: remove saved_eip

c->eip is never written back in case of emulation failure, so no need to
set it to old value.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2c66e097d916..0579d9dd9aac 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2424,7 +2424,6 @@ int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 	u64 msr_data;
-	unsigned long saved_eip = 0;
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 
@@ -2436,7 +2435,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	 */
 
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
-	saved_eip = c->eip;
 
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
 		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
@@ -2928,11 +2926,7 @@ writeback:
 	kvm_rip_write(ctxt->vcpu, c->eip);
 
 done:
-	if (rc == X86EMUL_UNHANDLEABLE) {
-		c->eip = saved_eip;
-		return -1;
-	}
-	return 0;
+	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 
 twobyte_insn:
 	switch (c->b) {
@@ -3209,6 +3203,5 @@ twobyte_insn:
 
 cannot_emulate:
 	DPRINTF("Cannot emulate %02x\n", c->b);
-	c->eip = saved_eip;
 	return -1;
 }
-- 
cgit v1.2.3-55-g7522


From 5cd21917da245fbe98bd443de2c7f519b3df6814 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:26 +0200
Subject: KVM: x86 emulator: restart string instruction without going back to a
 guest.

Currently when string instruction is only partially complete we go back
to a guest mode, guest tries to reexecute instruction and exits again
and at this point emulation continues. Avoid all of this by restarting
instruction without going back to a guest mode, but return to a guest
mode each 1024 iterations to allow interrupt injection. Pending
exception causes immediate guest entry too.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  1 +
 arch/x86/kvm/emulate.c             | 34 +++++++++++++++++++++++-----------
 arch/x86/kvm/x86.c                 | 19 ++++++++++++++++++-
 3 files changed, 42 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 679245c9a55f..7fda16f89cc8 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -193,6 +193,7 @@ struct x86_emulate_ctxt {
 	/* interruptibility state, as a result of execution of STI or MOV SS */
 	int interruptibility;
 
+	bool restart; /* restart string instruction after writeback */
 	/* decode cache */
 	struct decode_cache decode;
 };
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0579d9dd9aac..6de6ad1610d8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -927,8 +927,11 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	int mode = ctxt->mode;
 	int def_op_bytes, def_ad_bytes, group;
 
-	/* Shadow copy of register state. Committed on successful emulation. */
 
+	/* we cannot decode insn before we complete previous rep insn */
+	WARN_ON(ctxt->restart);
+
+	/* Shadow copy of register state. Committed on successful emulation. */
 	memset(c, 0, sizeof(struct decode_cache));
 	c->eip = ctxt->eip;
 	ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
@@ -2426,6 +2429,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	u64 msr_data;
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
+	int saved_dst_type = c->dst.type;
 
 	ctxt->interruptibility = 0;
 
@@ -2454,8 +2458,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	if (c->rep_prefix && (c->d & String)) {
+		ctxt->restart = true;
 		/* All REP prefixes have the same first termination condition */
 		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
+		string_done:
+			ctxt->restart = false;
 			kvm_rip_write(ctxt->vcpu, c->eip);
 			goto done;
 		}
@@ -2467,17 +2474,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		 * 	- if REPNE/REPNZ and ZF = 1 then done
 		 */
 		if ((c->b == 0xa6) || (c->b == 0xa7) ||
-				(c->b == 0xae) || (c->b == 0xaf)) {
+		    (c->b == 0xae) || (c->b == 0xaf)) {
 			if ((c->rep_prefix == REPE_PREFIX) &&
-				((ctxt->eflags & EFLG_ZF) == 0)) {
-					kvm_rip_write(ctxt->vcpu, c->eip);
-					goto done;
-			}
+			    ((ctxt->eflags & EFLG_ZF) == 0))
+				goto string_done;
 			if ((c->rep_prefix == REPNE_PREFIX) &&
-				((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
-				kvm_rip_write(ctxt->vcpu, c->eip);
-				goto done;
-			}
+			    ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
+				goto string_done;
 		}
 		c->eip = ctxt->eip;
 	}
@@ -2911,6 +2914,12 @@ writeback:
 	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
+	/*
+	 * restore dst type in case the decoding will be reused
+	 * (happens for string instruction )
+	 */
+	c->dst.type = saved_dst_type;
+
 	if ((c->d & SrcMask) == SrcSI)
 		string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
 				&c->src);
@@ -2918,8 +2927,11 @@ writeback:
 	if ((c->d & DstMask) == DstDI)
 		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
 
-	if (c->rep_prefix && (c->d & String))
+	if (c->rep_prefix && (c->d & String)) {
 		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+		if (!(c->regs[VCPU_REGS_RCX] & 0x3ff))
+			ctxt->restart = false;
+	}
 
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 658e8e8155cb..c88cb8145283 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3755,6 +3755,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		return EMULATE_DONE;
 	}
 
+restart:
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
 
@@ -3777,7 +3778,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 
 	if (r) {
 		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
-			return EMULATE_DONE;
+			goto done;
 		if (!vcpu->mmio_needed) {
 			kvm_report_emulation_failure(vcpu, "mmio");
 			return EMULATE_FAIL;
@@ -3792,6 +3793,13 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		return EMULATE_DO_MMIO;
 	}
 
+done:
+	if (vcpu->arch.exception.pending)
+		vcpu->arch.emulate_ctxt.restart = false;
+
+	if (vcpu->arch.emulate_ctxt.restart)
+		goto restart;
+
 	return EMULATE_DONE;
 }
 EXPORT_SYMBOL_GPL(emulate_instruction);
@@ -4560,6 +4568,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			goto out;
 		}
 	}
+	if (vcpu->arch.emulate_ctxt.restart) {
+		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
+		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+		if (r == EMULATE_DO_MMIO) {
+			r = 0;
+			goto out;
+		}
+	}
 	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
 		kvm_register_write(vcpu, VCPU_REGS_RAX,
 				     kvm_run->hypercall.ret);
-- 
cgit v1.2.3-55-g7522


From 7b262e90fc20a49fddf3dad94c8cead1f0439751 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:27 +0200
Subject: KVM: x86 emulator: introduce pio in string read ahead.

To optimize "rep ins" instruction do IO in big chunks ahead of time
instead of doing it only when required during instruction emulation.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  7 ++++++
 arch/x86/kvm/emulate.c             | 46 +++++++++++++++++++++++++++++++++-----
 2 files changed, 48 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7fda16f89cc8..b5e12c583860 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -151,6 +151,12 @@ struct fetch_cache {
 	unsigned long end;
 };
 
+struct read_cache {
+	u8 data[1024];
+	unsigned long pos;
+	unsigned long end;
+};
+
 struct decode_cache {
 	u8 twobyte;
 	u8 b;
@@ -178,6 +184,7 @@ struct decode_cache {
 	void *modrm_ptr;
 	unsigned long modrm_val;
 	struct fetch_cache fetch;
+	struct read_cache io_read;
 };
 
 struct x86_emulate_ctxt {
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6de6ad1610d8..ab3fff5bf7c4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1257,6 +1257,36 @@ done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
+static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+			   struct x86_emulate_ops *ops,
+			   unsigned int size, unsigned short port,
+			   void *dest)
+{
+	struct read_cache *rc = &ctxt->decode.io_read;
+
+	if (rc->pos == rc->end) { /* refill pio read ahead */
+		struct decode_cache *c = &ctxt->decode;
+		unsigned int in_page, n;
+		unsigned int count = c->rep_prefix ?
+			address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
+		in_page = (ctxt->eflags & EFLG_DF) ?
+			offset_in_page(c->regs[VCPU_REGS_RDI]) :
+			PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
+		n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
+			count);
+		if (n == 0)
+			n = 1;
+		rc->pos = rc->end = 0;
+		if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
+			return 0;
+		rc->end = n * size;
+	}
+
+	memcpy(dest, rc->data + rc->pos, size);
+	rc->pos += size;
+	return 1;
+}
+
 static u32 desc_limit_scaled(struct desc_struct *desc)
 {
 	u32 limit = get_desc_limit(desc);
@@ -2622,8 +2652,8 @@ special_insn:
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
-		if (!ops->pio_in_emulated(c->dst.bytes, c->regs[VCPU_REGS_RDX],
-					  &c->dst.val, 1, ctxt->vcpu))
+		if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
+				     c->regs[VCPU_REGS_RDX], &c->dst.val))
 			goto done; /* IO is needed, skip writeback */
 		break;
 	case 0x6e:		/* outsb */
@@ -2839,8 +2869,8 @@ special_insn:
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
-		if (!ops->pio_in_emulated(c->dst.bytes, c->src.val,
-					  &c->dst.val, 1, ctxt->vcpu))
+		if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
+				     &c->dst.val))
 			goto done; /* IO is needed */
 		break;
 	case 0xee: /* out al,dx */
@@ -2928,8 +2958,14 @@ writeback:
 		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
 
 	if (c->rep_prefix && (c->d & String)) {
+		struct read_cache *rc = &ctxt->decode.io_read;
 		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
-		if (!(c->regs[VCPU_REGS_RCX] & 0x3ff))
+		/*
+		 * Re-enter guest when pio read ahead buffer is empty or,
+		 * if it is not used, after each 1024 iteration.
+		 */
+		if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
+		    (rc->end != 0 && rc->end == rc->pos))
 			ctxt->restart = false;
 	}
 
-- 
cgit v1.2.3-55-g7522


From 92bf9748b5bc381070f6adf0b56efd3428e4a97b Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 18 Mar 2010 15:20:28 +0200
Subject: KVM: small kvm_arch_vcpu_ioctl_run() cleanup.

Unify all conditions that get us back into emulator after returning from
userspace.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 32 ++++++--------------------------
 1 file changed, 6 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c88cb8145283..cc540f27053f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4542,33 +4542,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (!irqchip_in_kernel(vcpu->kvm))
 		kvm_set_cr8(vcpu, kvm_run->cr8);
 
-	if (vcpu->arch.pio.count) {
-		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
-		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-		if (r == EMULATE_DO_MMIO) {
-			r = 0;
-			goto out;
-		}
-	}
-	if (vcpu->mmio_needed) {
-		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
-		vcpu->mmio_read_completed = 1;
-		vcpu->mmio_needed = 0;
-
-		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-		r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
-					EMULTYPE_NO_DECODE);
-		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-		if (r == EMULATE_DO_MMIO) {
-			/*
-			 * Read-modify-write.  Back to userspace.
-			 */
-			r = 0;
-			goto out;
+	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
+	    vcpu->arch.emulate_ctxt.restart) {
+		if (vcpu->mmio_needed) {
+			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+			vcpu->mmio_read_completed = 1;
+			vcpu->mmio_needed = 0;
 		}
-	}
-	if (vcpu->arch.emulate_ctxt.restart) {
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-- 
cgit v1.2.3-55-g7522


From 9749a6c0f0a4f88ae7bad4f65d7da32769e9b2b7 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Sat, 20 Mar 2010 10:14:13 +0100
Subject: KVM: x86: Fix 32-bit build breakage due to typo

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cc540f27053f..b4d3363b78e6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3328,7 +3328,7 @@ EXPORT_SYMBOL_GPL(emulator_write_emulated);
 #  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
 #else
 #  define CMPXCHG64(ptr, old, new) \
-	(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u *)(new)) == *(u64 *)(old))
+	(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
 #endif
 
 static int emulator_cmpxchg_emulated(unsigned long addr,
-- 
cgit v1.2.3-55-g7522


From 482ac18ae293a3a0b1e1eea95c10dcc9ceeb4708 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Sun, 21 Mar 2010 13:08:20 +0200
Subject: KVM: x86 emulator: commit rflags as part of registers commit

Make sure that rflags is committed only after successful instruction
emulation.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h | 1 +
 arch/x86/kvm/emulate.c             | 1 +
 arch/x86/kvm/x86.c                 | 8 ++++++--
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b5e12c583860..a1319c82050e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -136,6 +136,7 @@ struct x86_emulate_ops {
 	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
 	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
 	int (*cpl)(struct kvm_vcpu *vcpu);
+	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 };
 
 /* Type, address-of, and value of an instruction's operand. */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ab3fff5bf7c4..48de4b890055 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2972,6 +2972,7 @@ writeback:
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(ctxt->vcpu, c->eip);
+	ops->set_rflags(ctxt->vcpu, ctxt->eflags);
 
 done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4d3363b78e6..247e805a041e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3649,6 +3649,11 @@ static void emulator_set_segment_selector(u16 sel, int seg,
 	kvm_set_segment(vcpu, &kvm_seg, seg);
 }
 
+static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+	kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+
 static struct x86_emulate_ops emulate_ops = {
 	.read_std            = kvm_read_guest_virt_system,
 	.write_std           = kvm_write_guest_virt_system,
@@ -3666,6 +3671,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
 	.cpl                 = emulator_get_cpl,
+	.set_rflags          = emulator_set_rflags,
 };
 
 static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3786,8 +3792,6 @@ restart:
 		return EMULATE_DO_MMIO;
 	}
 
-	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-
 	if (vcpu->mmio_is_write) {
 		vcpu->mmio_needed = 0;
 		return EMULATE_DO_MMIO;
-- 
cgit v1.2.3-55-g7522


From 6550e1f165f384f3a46b60a1be9aba4bc3c2adad Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Sun, 21 Mar 2010 13:08:21 +0200
Subject: KVM: x86 emulator: add decoding of CMPXCHG8B dst operand

Decode CMPXCHG8B destination operand in decoding stage. Fixes regression
introduced by "If LOCK prefix is used dest arg should be memory" commit.
This commit relies on dst operand be decoded at the beginning of an
instruction emulation.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 48de4b890055..b8ce53861f68 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -52,6 +52,7 @@
 #define DstMem      (3<<1)	/* Memory operand. */
 #define DstAcc      (4<<1)      /* Destination Accumulator */
 #define DstDI       (5<<1)	/* Destination is in ES:(E)DI */
+#define DstMem64    (6<<1)	/* 64bit memory operand */
 #define DstMask     (7<<1)
 /* Source operand type. */
 #define SrcNone     (0<<4)	/* No source operand. */
@@ -360,7 +361,7 @@ static u32 group_table[] = {
 	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
 	DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
 	[Group9*8] =
-	0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
+	0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
 };
 
 static u32 group2_table[] = {
@@ -1205,6 +1206,7 @@ done_prefixes:
 			 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
 		break;
 	case DstMem:
+	case DstMem64:
 		if ((c->d & ModRM) && c->modrm_mod == 3) {
 			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 			c->dst.type = OP_REG;
@@ -1214,7 +1216,10 @@ done_prefixes:
 		}
 		c->dst.type = OP_MEM;
 		c->dst.ptr = (unsigned long *)c->modrm_ea;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		if ((c->d & DstMask) == DstMem64)
+			c->dst.bytes = 8;
+		else
+			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->dst.val = 0;
 		if (c->d & BitOp) {
 			unsigned long mask = ~(c->dst.bytes * 8 - 1);
@@ -1706,12 +1711,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
 			       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	u64 old, new;
-	int rc;
-
-	rc = ops->read_emulated(c->modrm_ea, &old, 8, ctxt->vcpu);
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
+	u64 old = c->dst.orig_val;
 
 	if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
 	    ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
@@ -1719,15 +1719,12 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
 		c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
 		c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
 		ctxt->eflags &= ~EFLG_ZF;
-
 	} else {
-		new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+		c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
 		       (u32) c->regs[VCPU_REGS_RBX];
 
-		rc = ops->cmpxchg_emulated(c->modrm_ea, &old, &new, 8, ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			return rc;
 		ctxt->eflags |= EFLG_ZF;
+		c->lock_prefix = 1;
 	}
 	return X86EMUL_CONTINUE;
 }
@@ -3245,7 +3242,6 @@ twobyte_insn:
 		rc = emulate_grp9(ctxt, ops);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
-		c->dst.type = OP_NONE;
 		break;
 	}
 	goto writeback;
-- 
cgit v1.2.3-55-g7522


From de3e6480f76804fe06d460ddb1920c7daa07f29b Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Sun, 21 Mar 2010 16:58:36 +0200
Subject: KVM: x86 emulator: fix unlocked CMPXCHG8B emulation

When CMPXCHG8B is executed without LOCK prefix it is racy. Preserve this
behaviour in emulator too.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b8ce53861f68..64c9854f0458 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1724,7 +1724,6 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
 		       (u32) c->regs[VCPU_REGS_RBX];
 
 		ctxt->eflags |= EFLG_ZF;
-		c->lock_prefix = 1;
 	}
 	return X86EMUL_CONTINUE;
 }
-- 
cgit v1.2.3-55-g7522


From 041b1359a29b9301bc8baed6efcdbad29f80f6af Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti
Date: Tue, 23 Mar 2010 14:15:53 -0300
Subject: KVM: x86: document KVM_REQ_PENDING_TIMER usage

Document that KVM_REQ_PENDING_TIMER is implicitly used during guest
entry.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/timer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index eea40439066c..4ddadb1a5ffe 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -12,7 +12,8 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
 	/*
 	 * There is a race window between reading and incrementing, but we do
 	 * not care about potentially loosing timer events in the !reinject
-	 * case anyway.
+	 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
+	 * in vcpu_enter_guest.
 	 */
 	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
 		atomic_inc(&ktimer->pending);
-- 
cgit v1.2.3-55-g7522


From f815bce8940bc28e42e6b924825bb31df2a4bff5 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Fri, 19 Mar 2010 17:58:53 +0800
Subject: KVM: MMU: check reserved bits only if CR4.PSE=1 or CR4.PAE=1

- Check reserved bits only if CR4.PAE=1 or CR4.PSE=1 when guest #PF occurs
- Fix a typo in reset_rsvds_bits_mask()

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 064c3efb49dc..83d2ebce9ea9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2297,13 +2297,19 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		/* no rsvd bits for 2 level 4K page table entries */
 		context->rsvd_bits_mask[0][1] = 0;
 		context->rsvd_bits_mask[0][0] = 0;
+		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+
+		if (!is_pse(vcpu)) {
+			context->rsvd_bits_mask[1][1] = 0;
+			break;
+		}
+
 		if (is_cpuid_PSE36())
 			/* 36bits PSE 4MB page */
 			context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
 		else
 			/* 32 bits PSE 4MB page */
 			context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
-		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
 		break;
 	case PT32E_ROOT_LEVEL:
 		context->rsvd_bits_mask[0][2] =
@@ -2316,7 +2322,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 62) |
 			rsvd_bits(13, 20);		/* large page */
-		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
+		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
 		break;
 	case PT64_ROOT_LEVEL:
 		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2334,7 +2340,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 51) |
 			rsvd_bits(13, 20);		/* large page */
-		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
+		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
 		break;
 	}
 }
-- 
cgit v1.2.3-55-g7522


From 84b0c8c6a6f87b62bca93727dee12ec59e32e597 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Sun, 14 Mar 2010 10:16:40 +0200
Subject: KVM: MMU: Disassociate direct maps from guest levels

Direct maps are linear translations for a section of memory, used for
real mode or with large pages.  As such, they are independent of the guest
levels.

Teach the mmu about this by making page->role.glevels = 0 for direct maps.
This allows direct maps to be shared among real mode and the various paging
modes.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 83d2ebce9ea9..1cc60d3f445b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1329,6 +1329,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	role = vcpu->arch.mmu.base_role;
 	role.level = level;
 	role.direct = direct;
+	if (role.direct)
+		role.glevels = 0;
 	role.access = access;
 	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
-- 
cgit v1.2.3-55-g7522


From 805d32dea4dfb8319aaf7e73a681ad410a5e331a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 1 Apr 2010 16:50:45 +0800
Subject: KVM: MMU: cleanup/fix mmu audit code

This patch does:
- 'sp' parameter in inspect_spte_fn() is not used, so remove it
- fix 'kvm' and 'slots' is not defined in count_rmaps()
- fix a bug in inspect_spte_has_rmap()

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1cc60d3f445b..6ed7d633aefe 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3182,8 +3182,7 @@ static gva_t canonicalize(gva_t gva)
 }
 
 
-typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
-				 u64 *sptep);
+typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
 
 static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
 			    inspect_spte_fn fn)
@@ -3199,7 +3198,7 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
 				child = page_header(ent & PT64_BASE_ADDR_MASK);
 				__mmu_spte_walk(kvm, child, fn);
 			} else
-				fn(kvm, sp, &sp->spt[i]);
+				fn(kvm, &sp->spt[i]);
 		}
 	}
 }
@@ -3290,6 +3289,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
 
 static int count_rmaps(struct kvm_vcpu *vcpu)
 {
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_memslots *slots;
 	int nmaps = 0;
 	int i, j, k, idx;
 
@@ -3323,7 +3324,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
 	return nmaps;
 }
 
-void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
+void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 {
 	unsigned long *rmapp;
 	struct kvm_mmu_page *rev_sp;
@@ -3339,14 +3340,14 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
 			printk(KERN_ERR "%s: no memslot for gfn %ld\n",
 					 audit_msg, gfn);
 			printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
-					audit_msg, sptep - rev_sp->spt,
+			       audit_msg, (long int)(sptep - rev_sp->spt),
 					rev_sp->gfn);
 			dump_stack();
 			return;
 		}
 
 		rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
-				    is_large_pte(*sptep));
+				    rev_sp->role.level);
 		if (!*rmapp) {
 			if (!printk_ratelimit())
 				return;
@@ -3381,7 +3382,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
 				continue;
 			if (!(ent & PT_WRITABLE_MASK))
 				continue;
-			inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
+			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
 		}
 	}
 	return;
-- 
cgit v1.2.3-55-g7522


From f84cbb0561d7cefb5fc050ee99d8c82ec9ce883d Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Tue, 6 Apr 2010 18:29:05 +0800
Subject: KVM: MMU: remove unused field

kvm_mmu_page.oos_link is not used, so remove it

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 --
 arch/x86/kvm/mmu.c              | 1 -
 2 files changed, 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 26c629a062db..0c49c888be6b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -187,8 +187,6 @@ struct kvm_mmu_page {
 	struct list_head link;
 	struct hlist_node hash_link;
 
-	struct list_head oos_link;
-
 	/*
 	 * The following two entries are used to key the shadow page in the
 	 * hash table.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6ed7d633aefe..ec8900b6692a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -923,7 +923,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-	INIT_LIST_HEAD(&sp->oos_link);
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	sp->multimapped = 0;
 	sp->parent_pte = parent_pte;
-- 
cgit v1.2.3-55-g7522


From 24222c2fec75df30d56d1e2014d45d2559c94f1a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Tue, 6 Apr 2010 18:31:13 +0800
Subject: KVM: MMU: remove unnecessary NX check in walk_addr

After is_rsvd_bits_set() checks, EFER.NXE must be enabled if NX bit is seted

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 067797a72768..d9dea288e51d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -170,7 +170,7 @@ walk:
 			goto access_error;
 
 #if PTTYPE == 64
-		if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
+		if (fetch_fault && (pte & PT64_NX_MASK))
 			goto access_error;
 #endif
 
-- 
cgit v1.2.3-55-g7522


From 2fb53ad811e238d5dec8716b99986c3f234e3337 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Sun, 11 Apr 2010 13:05:15 +0300
Subject: KVM: x86 emulator: Don't overwrite decode cache

Currently if we an instruction spans a page boundary, when we fetch the
second half we overwrite the first half.  This prevents us from tracing
the full instruction opcodes.

Fix by appending the second half to the first.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 64c9854f0458..083b269a83ea 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -646,21 +646,22 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
 
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 			      struct x86_emulate_ops *ops,
-			      unsigned long linear, u8 *dest)
+			      unsigned long eip, u8 *dest)
 {
 	struct fetch_cache *fc = &ctxt->decode.fetch;
 	int rc;
-	int size;
+	int size, cur_size;
 
-	if (linear < fc->start || linear >= fc->end) {
-		size = min(15UL, PAGE_SIZE - offset_in_page(linear));
-		rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
+	if (eip == fc->end) {
+		cur_size = fc->end - fc->start;
+		size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
+		rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
+				size, ctxt->vcpu, NULL);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
-		fc->start = linear;
-		fc->end = linear + size;
+		fc->end += size;
 	}
-	*dest = fc->data[linear - fc->start];
+	*dest = fc->data[eip - fc->start];
 	return X86EMUL_CONTINUE;
 }
 
@@ -673,7 +674,6 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 	/* x86 instructions are limited to 15 bytes. */
 	if (eip + size - ctxt->eip > 15)
 		return X86EMUL_UNHANDLEABLE;
-	eip += ctxt->cs_base;
 	while (size--) {
 		rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
 		if (rc != X86EMUL_CONTINUE)
@@ -935,6 +935,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	/* Shadow copy of register state. Committed on successful emulation. */
 	memset(c, 0, sizeof(struct decode_cache));
 	c->eip = ctxt->eip;
+	c->fetch.start = c->fetch.end = c->eip;
 	ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 
-- 
cgit v1.2.3-55-g7522


From e46479f852adab6027e4950d69400d967bf7bc6f Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Sun, 11 Apr 2010 13:05:16 +0300
Subject: KVM: Trace emulated instructions

Log emulated instructions in ftrace, especially if they failed.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/trace.h | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c   |  4 +++
 2 files changed, 90 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 32c912c40bf8..a6544b8e7c0f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -603,6 +603,92 @@ TRACE_EVENT(kvm_skinit,
 		  __entry->rip, __entry->slb)
 );
 
+#define __print_insn(insn, ilen) ({		                 \
+	int i;							 \
+	const char *ret = p->buffer + p->len;			 \
+								 \
+	for (i = 0; i < ilen; ++i)				 \
+		trace_seq_printf(p, " %02x", insn[i]);		 \
+	trace_seq_printf(p, "%c", 0);				 \
+	ret;							 \
+	})
+
+#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
+#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
+#define KVM_EMUL_INSN_F_CS_D   (1 << 2)
+#define KVM_EMUL_INSN_F_CS_L   (1 << 3)
+
+#define kvm_trace_symbol_emul_flags	                  \
+	{ 0,   			    "real" },		  \
+	{ KVM_EMUL_INSN_F_CR0_PE			  \
+	  | KVM_EMUL_INSN_F_EFL_VM, "vm16" },		  \
+	{ KVM_EMUL_INSN_F_CR0_PE,   "prot16" },		  \
+	{ KVM_EMUL_INSN_F_CR0_PE			  \
+	  | KVM_EMUL_INSN_F_CS_D,   "prot32" },		  \
+	{ KVM_EMUL_INSN_F_CR0_PE			  \
+	  | KVM_EMUL_INSN_F_CS_L,   "prot64" }
+
+#define kei_decode_mode(mode) ({			\
+	u8 flags = 0xff;				\
+	switch (mode) {					\
+	case X86EMUL_MODE_REAL:				\
+		flags = 0;				\
+		break;					\
+	case X86EMUL_MODE_VM86:				\
+		flags = KVM_EMUL_INSN_F_EFL_VM;		\
+		break;					\
+	case X86EMUL_MODE_PROT16:			\
+		flags = KVM_EMUL_INSN_F_CR0_PE;		\
+		break;					\
+	case X86EMUL_MODE_PROT32:			\
+		flags = KVM_EMUL_INSN_F_CR0_PE		\
+			| KVM_EMUL_INSN_F_CS_D;		\
+		break;					\
+	case X86EMUL_MODE_PROT64:			\
+		flags = KVM_EMUL_INSN_F_CR0_PE		\
+			| KVM_EMUL_INSN_F_CS_L;		\
+		break;					\
+	}						\
+	flags;						\
+	})
+
+TRACE_EVENT(kvm_emulate_insn,
+	TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
+	TP_ARGS(vcpu, failed),
+
+	TP_STRUCT__entry(
+		__field(    __u64, rip                       )
+		__field(    __u32, csbase                    )
+		__field(    __u8,  len                       )
+		__array(    __u8,  insn,    15	             )
+		__field(    __u8,  flags       	   	     )
+		__field(    __u8,  failed                    )
+		),
+
+	TP_fast_assign(
+		__entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
+		__entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
+		__entry->len = vcpu->arch.emulate_ctxt.decode.eip
+			       - vcpu->arch.emulate_ctxt.decode.fetch.start;
+		memcpy(__entry->insn,
+		       vcpu->arch.emulate_ctxt.decode.fetch.data,
+		       15);
+		__entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
+		__entry->failed = failed;
+		),
+
+	TP_printk("%x:%llx:%s (%s)%s",
+		  __entry->csbase, __entry->rip,
+		  __print_insn(__entry->insn, __entry->len),
+		  __print_symbolic(__entry->flags,
+				   kvm_trace_symbol_emul_flags),
+		  __entry->failed ? " failed" : ""
+		)
+	);
+
+#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
+#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 247e805a041e..33a40c544c7a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3718,6 +3718,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 
 		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+		trace_kvm_emulate_insn_start(vcpu);
 
 		/* Only allow emulation of specific instructions on #UD
 		 * (namely VMMCALL, sysenter, sysexit, syscall)*/
@@ -3750,6 +3751,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		++vcpu->stat.insn_emulation;
 		if (r)  {
 			++vcpu->stat.insn_emulation_fail;
+			trace_kvm_emulate_insn_failed(vcpu);
 			if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
 				return EMULATE_DONE;
 			return EMULATE_FAIL;
@@ -3786,6 +3788,8 @@ restart:
 		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
 			goto done;
 		if (!vcpu->mmio_needed) {
+			++vcpu->stat.insn_emulation_fail;
+			trace_kvm_emulate_insn_failed(vcpu);
 			kvm_report_emulation_failure(vcpu, "mmio");
 			return EMULATE_FAIL;
 		}
-- 
cgit v1.2.3-55-g7522


From f7a711971edd952352a89698db1d36f469e25f77 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Sun, 11 Apr 2010 15:33:32 +0300
Subject: KVM: Fix MAXPHYADDR calculation when cpuid does not support it

MAXPHYADDR is derived from cpuid 0x80000008, but when that isn't present, we
get some random value.

Fix by checking first that cpuid 0x80000008 is supported.

Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 33a40c544c7a..d65e481c5fa4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4171,9 +4171,13 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
 
+	best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+	if (!best || best->eax < 0x80000008)
+		goto not_found;
 	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
 	if (best)
 		return best->eax & 0xff;
+not_found:
 	return 36;
 }
 
-- 
cgit v1.2.3-55-g7522


From 6bc31bdc55cad6609b1610b4cecad312664f2808 Mon Sep 17 00:00:00 2001
From: Andre Przywara
Date: Sun, 11 Apr 2010 23:07:28 +0200
Subject: KVM: SVM: implement NEXTRIPsave SVM feature

On SVM we set the instruction length of skipped instructions
to hard-coded, well known values, which could be wrong when (bogus,
but valid) prefixes (REX, segment override) are used.
Newer AMD processors (Fam10h 45nm and better, aka. PhenomII or
AthlonII) have an explicit NEXTRIP field in the VMCB containing the
desired information.
Since it is cheap to do so, we use this field to override the guessed
value on newer processors.
A fix for older CPUs would be rather expensive, as it would require
to fetch and partially decode the instruction. As the problem is not
a security issue and needs special, handcrafted code to trigger
(no compiler will ever generate such code), I omit a fix for older
CPUs.
If someone is interested, I have both a patch for these CPUs as well as
demo code triggering this issue: It segfaults under KVM, but runs
perfectly on native Linux.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/svm.h |  4 +++-
 arch/x86/kvm/svm.c         | 13 ++++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index b26a38d85356..1d91d05f9368 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -81,7 +81,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u32 event_inj_err;
 	u64 nested_cr3;
 	u64 lbr_ctl;
-	u8 reserved_5[832];
+	u64 reserved_5;
+	u64 next_rip;
+	u8 reserved_6[816];
 };
 
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e9f79619e185..64b7f60dc5b8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -44,11 +44,11 @@ MODULE_LICENSE("GPL");
 #define SEG_TYPE_LDT 2
 #define SEG_TYPE_BUSY_TSS16 3
 
-#define SVM_FEATURE_NPT  (1 << 0)
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_FEATURE_SVML (1 << 2)
-#define SVM_FEATURE_NRIP (1 << 3)
-#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
+#define SVM_FEATURE_NPT            (1 <<  0)
+#define SVM_FEATURE_LBRV           (1 <<  1)
+#define SVM_FEATURE_SVML           (1 <<  2)
+#define SVM_FEATURE_NRIP           (1 <<  3)
+#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
 
 #define NESTED_EXIT_HOST	0	/* Exit handled on host level */
 #define NESTED_EXIT_DONE	1	/* Exit caused nested vmexit  */
@@ -320,6 +320,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (svm->vmcb->control.next_rip != 0)
+		svm->next_rip = svm->vmcb->control.next_rip;
+
 	if (!svm->next_rip) {
 		if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
 				EMULATE_DONE)
-- 
cgit v1.2.3-55-g7522


From 020df0794f5764e742feaa718be88b8f1b4ce04f Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Tue, 13 Apr 2010 10:05:23 +0300
Subject: KVM: move DR register access handling into generic code

Currently both SVM and VMX have their own DR handling code. Move it to
x86.c.

Acked-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  5 +--
 arch/x86/kvm/svm.c              | 66 ++--------------------------------
 arch/x86/kvm/vmx.c              | 78 ++++++-----------------------------------
 arch/x86/kvm/x86.c              | 78 +++++++++++++++++++++++++++++++++++++++--
 4 files changed, 93 insertions(+), 134 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0c49c888be6b..5d5e0a9afcf2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -496,8 +496,7 @@ struct kvm_x86_ops {
 	void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 	void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 	void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
-	int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
-	int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
+	void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
 	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -602,6 +601,8 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 64b7f60dc5b8..87b36fbbfec8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1307,70 +1307,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
 	svm->vmcb->control.asid = sd->next_asid++;
 }
 
-static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
+static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	switch (dr) {
-	case 0 ... 3:
-		*dest = vcpu->arch.db[dr];
-		break;
-	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return EMULATE_FAIL; /* will re-inject UD */
-		/* fall through */
-	case 6:
-		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
-			*dest = vcpu->arch.dr6;
-		else
-			*dest = svm->vmcb->save.dr6;
-		break;
-	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return EMULATE_FAIL; /* will re-inject UD */
-		/* fall through */
-	case 7:
-		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
-			*dest = vcpu->arch.dr7;
-		else
-			*dest = svm->vmcb->save.dr7;
-		break;
-	}
-
-	return EMULATE_DONE;
-}
-
-static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	switch (dr) {
-	case 0 ... 3:
-		vcpu->arch.db[dr] = value;
-		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
-			vcpu->arch.eff_db[dr] = value;
-		break;
-	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return EMULATE_FAIL; /* will re-inject UD */
-		/* fall through */
-	case 6:
-		vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
-		break;
-	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return EMULATE_FAIL; /* will re-inject UD */
-		/* fall through */
-	case 7:
-		vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
-		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
-			svm->vmcb->save.dr7 = vcpu->arch.dr7;
-			vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
-		}
-		break;
-	}
-
-	return EMULATE_DONE;
+	svm->vmcb->save.dr7 = value;
 }
 
 static int pf_interception(struct vcpu_svm *svm)
@@ -3302,8 +3243,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_idt = svm_set_idt,
 	.get_gdt = svm_get_gdt,
 	.set_gdt = svm_set_gdt,
-	.get_dr = svm_get_dr,
-	.set_dr = svm_set_dr,
+	.set_dr7 = svm_set_dr7,
 	.cache_reg = svm_cache_reg,
 	.get_rflags = svm_get_rflags,
 	.set_rflags = svm_set_rflags,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1cceca1c59be..fb4a8869bb99 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3089,19 +3089,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int check_dr_alias(struct kvm_vcpu *vcpu)
-{
-	if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-		kvm_queue_exception(vcpu, UD_VECTOR);
-		return -1;
-	}
-	return 0;
-}
-
 static int handle_dr(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification;
-	unsigned long val;
 	int dr, reg;
 
 	/* Do not handle if the CPL > 0, will trigger GP on re-entry */
@@ -3136,67 +3126,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
 	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
 	if (exit_qualification & TYPE_MOV_FROM_DR) {
-		switch (dr) {
-		case 0 ... 3:
-			val = vcpu->arch.db[dr];
-			break;
-		case 4:
-			if (check_dr_alias(vcpu) < 0)
-				return 1;
-			/* fall through */
-		case 6:
-			val = vcpu->arch.dr6;
-			break;
-		case 5:
-			if (check_dr_alias(vcpu) < 0)
-				return 1;
-			/* fall through */
-		default: /* 7 */
-			val = vcpu->arch.dr7;
-			break;
-		}
-		kvm_register_write(vcpu, reg, val);
-	} else {
-		val = vcpu->arch.regs[reg];
-		switch (dr) {
-		case 0 ... 3:
-			vcpu->arch.db[dr] = val;
-			if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
-				vcpu->arch.eff_db[dr] = val;
-			break;
-		case 4:
-			if (check_dr_alias(vcpu) < 0)
-				return 1;
-			/* fall through */
-		case 6:
-			if (val & 0xffffffff00000000ULL) {
-				kvm_inject_gp(vcpu, 0);
-				return 1;
-			}
-			vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
-			break;
-		case 5:
-			if (check_dr_alias(vcpu) < 0)
-				return 1;
-			/* fall through */
-		default: /* 7 */
-			if (val & 0xffffffff00000000ULL) {
-				kvm_inject_gp(vcpu, 0);
-				return 1;
-			}
-			vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
-			if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
-				vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
-				vcpu->arch.switch_db_regs =
-					(val & DR7_BP_EN_MASK);
-			}
-			break;
-		}
-	}
+		unsigned long val;
+		if (!kvm_get_dr(vcpu, dr, &val))
+			kvm_register_write(vcpu, reg, val);
+	} else
+		kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
 	skip_emulated_instruction(vcpu);
 	return 1;
 }
 
+static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	vmcs_writel(GUEST_DR7, val);
+}
+
 static int handle_cpuid(struct kvm_vcpu *vcpu)
 {
 	kvm_emulate_cpuid(vcpu);
@@ -4187,6 +4130,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_idt = vmx_set_idt,
 	.get_gdt = vmx_get_gdt,
 	.set_gdt = vmx_set_gdt,
+	.set_dr7 = vmx_set_dr7,
 	.cache_reg = vmx_cache_reg,
 	.get_rflags = vmx_get_rflags,
 	.set_rflags = vmx_set_rflags,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d65e481c5fa4..09dccac2df7e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -562,6 +562,80 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+	switch (dr) {
+	case 0 ... 3:
+		vcpu->arch.db[dr] = val;
+		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+			vcpu->arch.eff_db[dr] = val;
+		break;
+	case 4:
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+			kvm_queue_exception(vcpu, UD_VECTOR);
+			return 1;
+		}
+		/* fall through */
+	case 6:
+		if (val & 0xffffffff00000000ULL) {
+			kvm_inject_gp(vcpu, 0);
+			return 1;
+		}
+		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+		break;
+	case 5:
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+			kvm_queue_exception(vcpu, UD_VECTOR);
+			return 1;
+		}
+		/* fall through */
+	default: /* 7 */
+		if (val & 0xffffffff00000000ULL) {
+			kvm_inject_gp(vcpu, 0);
+			return 1;
+		}
+		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+			kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
+			vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
+		}
+		break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_dr);
+
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+{
+	switch (dr) {
+	case 0 ... 3:
+		*val = vcpu->arch.db[dr];
+		break;
+	case 4:
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+			kvm_queue_exception(vcpu, UD_VECTOR);
+			return 1;
+		}
+		/* fall through */
+	case 6:
+		*val = vcpu->arch.dr6;
+		break;
+	case 5:
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+			kvm_queue_exception(vcpu, UD_VECTOR);
+			return 1;
+		}
+		/* fall through */
+	default: /* 7 */
+		*val = vcpu->arch.dr7;
+		break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dr);
+
 static inline u32 bit(int bitno)
 {
 	return 1 << (bitno & 31);
@@ -3483,14 +3557,14 @@ int emulate_clts(struct kvm_vcpu *vcpu)
 
 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
 {
-	return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
+	return kvm_get_dr(ctxt->vcpu, dr, dest);
 }
 
 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
 {
 	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
 
-	return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
+	return kvm_set_dr(ctxt->vcpu, dr, value & mask);
 }
 
 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
-- 
cgit v1.2.3-55-g7522


From 8f6abd06f521112a0a3bc906df273fa3ce0a9387 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Tue, 13 Apr 2010 10:21:56 +0300
Subject: KVM: x86: get rid of mmu_only parameter in emulator_write_emulated()

We can call kvm_mmu_pte_write() directly from
emulator_cmpxchg_emulated() instead of passing mmu_only down to
emulator_write_emulated_onepage() and call it there.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 36 +++++++++++-------------------------
 1 file changed, 11 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 09dccac2df7e..40991527f54a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3322,8 +3322,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 static int emulator_write_emulated_onepage(unsigned long addr,
 					   const void *val,
 					   unsigned int bytes,
-					   struct kvm_vcpu *vcpu,
-					   bool mmu_only)
+					   struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
 	u32 error_code;
@@ -3339,10 +3338,6 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 		goto mmio;
 
-	if (mmu_only) {
-		kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
-		return X86EMUL_CONTINUE;
-	}
 	if (emulator_write_phys(vcpu, gpa, val, bytes))
 		return X86EMUL_CONTINUE;
 
@@ -3363,35 +3358,24 @@ mmio:
 	return X86EMUL_CONTINUE;
 }
 
-int __emulator_write_emulated(unsigned long addr,
-				   const void *val,
-				   unsigned int bytes,
-				   struct kvm_vcpu *vcpu,
-				   bool mmu_only)
+int emulator_write_emulated(unsigned long addr,
+			    const void *val,
+			    unsigned int bytes,
+			    struct kvm_vcpu *vcpu)
 {
 	/* Crossing a page boundary? */
 	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
 		int rc, now;
 
 		now = -addr & ~PAGE_MASK;
-		rc = emulator_write_emulated_onepage(addr, val, now, vcpu,
-						     mmu_only);
+		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		addr += now;
 		val += now;
 		bytes -= now;
 	}
-	return emulator_write_emulated_onepage(addr, val, bytes, vcpu,
-					       mmu_only);
-}
-
-int emulator_write_emulated(unsigned long addr,
-				   const void *val,
-				   unsigned int bytes,
-				   struct kvm_vcpu *vcpu)
-{
-	return __emulator_write_emulated(addr, val, bytes, vcpu, false);
+	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
 }
 EXPORT_SYMBOL_GPL(emulator_write_emulated);
 
@@ -3455,7 +3439,9 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 	if (!exchanged)
 		return X86EMUL_CMPXCHG_FAILED;
 
-	return __emulator_write_emulated(addr, new, bytes, vcpu, true);
+	kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
+
+	return X86EMUL_CONTINUE;
 
 emul_write:
 	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
@@ -4165,7 +4151,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
-	return __emulator_write_emulated(rip, instruction, 3, vcpu, false);
+	return emulator_write_emulated(rip, instruction, 3, vcpu);
 }
 
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-- 
cgit v1.2.3-55-g7522


From 0760d44868f351ba30fc9a08cf1830e46aa72466 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Wed, 14 Apr 2010 15:50:57 +0200
Subject: KVM: x86: Terminate early if task_switch_16/32 failed

Stop the switch immediately if task_switch_16/32 returned an error. Only
if that step succeeded, the switch should actually take place and update
any register states.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 083b269a83ea..aace5659bbe0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2402,6 +2402,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	else
 		ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
 				     old_tss_base, &next_tss_desc);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
 
 	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
 		ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
-- 
cgit v1.2.3-55-g7522


From e269fb2189fb86d79d64c0ca74c6c1a549ad4aa3 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Wed, 14 Apr 2010 15:51:09 +0200
Subject: KVM: x86: Push potential exception error code on task switches

When a fault triggers a task switch, the error code, if existent, has to
be pushed on the new task's stack. Implement the missing bits.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  3 ++-
 arch/x86/include/asm/kvm_host.h    |  3 ++-
 arch/x86/include/asm/svm.h         |  1 +
 arch/x86/kvm/emulate.c             | 22 ++++++++++++++++++----
 arch/x86/kvm/svm.c                 | 11 ++++++++++-
 arch/x86/kvm/vmx.c                 | 12 +++++++++++-
 arch/x86/kvm/x86.c                 |  6 ++++--
 7 files changed, 48 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index a1319c82050e..0b2729bf2070 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -230,6 +230,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
 		     struct x86_emulate_ops *ops);
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 			 struct x86_emulate_ops *ops,
-			 u16 tss_selector, int reason);
+			 u16 tss_selector, int reason,
+			 bool has_error_code, u32 error_code);
 
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5d5e0a9afcf2..3602728d54de 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -595,7 +595,8 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+		    bool has_error_code, u32 error_code);
 
 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1d91d05f9368..0e831059ac5a 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -244,6 +244,7 @@ struct __attribute__ ((__packed__)) vmcb {
 
 #define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
 #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
+#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
 
 #define	SVM_EXIT_READ_CR0 	0x000
 #define	SVM_EXIT_READ_CR3 	0x003
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index aace5659bbe0..585d0ef4a5f6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2344,8 +2344,9 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 }
 
 static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
-				    struct x86_emulate_ops *ops,
-				    u16 tss_selector, int reason)
+				   struct x86_emulate_ops *ops,
+				   u16 tss_selector, int reason,
+				   bool has_error_code, u32 error_code)
 {
 	struct desc_struct curr_tss_desc, next_tss_desc;
 	int ret;
@@ -2418,12 +2419,22 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
 	ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
 
+	if (has_error_code) {
+		struct decode_cache *c = &ctxt->decode;
+
+		c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+		c->lock_prefix = 0;
+		c->src.val = (unsigned long) error_code;
+		emulate_push(ctxt);
+	}
+
 	return ret;
 }
 
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 			 struct x86_emulate_ops *ops,
-			 u16 tss_selector, int reason)
+			 u16 tss_selector, int reason,
+			 bool has_error_code, u32 error_code)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
@@ -2431,12 +2442,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 	memset(c, 0, sizeof(struct decode_cache));
 	c->eip = ctxt->eip;
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+	c->dst.type = OP_NONE;
 
-	rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason);
+	rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
+				     has_error_code, error_code);
 
 	if (rc == X86EMUL_CONTINUE) {
 		memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
 		kvm_rip_write(ctxt->vcpu, c->eip);
+		rc = writeback(ctxt, ops);
 	}
 
 	return rc;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 87b36fbbfec8..78af52222fd2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2222,6 +2222,8 @@ static int task_switch_interception(struct vcpu_svm *svm)
 		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
 	uint32_t idt_v =
 		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
+	bool has_error_code = false;
+	u32 error_code = 0;
 
 	tss_selector = (u16)svm->vmcb->control.exit_info_1;
 
@@ -2242,6 +2244,12 @@ static int task_switch_interception(struct vcpu_svm *svm)
 			svm->vcpu.arch.nmi_injected = false;
 			break;
 		case SVM_EXITINTINFO_TYPE_EXEPT:
+			if (svm->vmcb->control.exit_info_2 &
+			    (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
+				has_error_code = true;
+				error_code =
+					(u32)svm->vmcb->control.exit_info_2;
+			}
 			kvm_clear_exception_queue(&svm->vcpu);
 			break;
 		case SVM_EXITINTINFO_TYPE_INTR:
@@ -2258,7 +2266,8 @@ static int task_switch_interception(struct vcpu_svm *svm)
 	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
 		skip_emulated_instruction(&svm->vcpu);
 
-	return kvm_task_switch(&svm->vcpu, tss_selector, reason);
+	return kvm_task_switch(&svm->vcpu, tss_selector, reason,
+			       has_error_code, error_code);
 }
 
 static int cpuid_interception(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fb4a8869bb99..1b38d8a88cf7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3271,6 +3271,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long exit_qualification;
+	bool has_error_code = false;
+	u32 error_code = 0;
 	u16 tss_selector;
 	int reason, type, idt_v;
 
@@ -3293,6 +3295,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 			kvm_clear_interrupt_queue(vcpu);
 			break;
 		case INTR_TYPE_HARD_EXCEPTION:
+			if (vmx->idt_vectoring_info &
+			    VECTORING_INFO_DELIVER_CODE_MASK) {
+				has_error_code = true;
+				error_code =
+					vmcs_read32(IDT_VECTORING_ERROR_CODE);
+			}
+			/* fall through */
 		case INTR_TYPE_SOFT_EXCEPTION:
 			kvm_clear_exception_queue(vcpu);
 			break;
@@ -3307,7 +3316,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 		       type != INTR_TYPE_NMI_INTR))
 		skip_emulated_instruction(vcpu);
 
-	if (!kvm_task_switch(vcpu, tss_selector, reason))
+	if (!kvm_task_switch(vcpu, tss_selector, reason, has_error_code,
+			     error_code))
 		return 0;
 
 	/* clear all local breakpoint enable flags */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 40991527f54a..58a295c6bf62 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4778,7 +4778,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+		    bool has_error_code, u32 error_code)
 {
 	int cs_db, cs_l, ret;
 	cache_all_regs(vcpu);
@@ -4796,7 +4797,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 
 	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
-				   tss_selector, reason);
+				   tss_selector, reason, has_error_code,
+				   error_code);
 
 	if (ret == X86EMUL_CONTINUE)
 		kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-- 
cgit v1.2.3-55-g7522


From 5b7e0102ae744e9175b905f4267a81393bdb7a75 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 14 Apr 2010 19:20:03 +0300
Subject: KVM: MMU: Replace role.glevels with role.cr4_pae

There is no real distinction between glevels=3 and glevels=4; both have
exactly the same format and the code is treated exactly the same way.  Drop
role.glevels and replace is with role.cr4_pae (which is meaningful).  This
simplifies the code a bit.

As a side effect, it allows sharing shadow page tables between pae and
longmode guest page tables at the same guest page.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu.c              | 12 ++++++------
 arch/x86/kvm/mmutrace.h         |  5 +++--
 3 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3602728d54de..707d272ae4a1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -171,8 +171,8 @@ struct kvm_pte_chain {
 union kvm_mmu_page_role {
 	unsigned word;
 	struct {
-		unsigned glevels:4;
 		unsigned level:4;
+		unsigned cr4_pae:1;
 		unsigned quadrant:2;
 		unsigned pad_for_nice_hex_output:6;
 		unsigned direct:1;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ec8900b6692a..51aa580d0aa5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1206,7 +1206,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-	if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
 		kvm_mmu_zap_page(vcpu->kvm, sp);
 		return 1;
 	}
@@ -1329,7 +1329,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	role.level = level;
 	role.direct = direct;
 	if (role.direct)
-		role.glevels = 0;
+		role.cr4_pae = 0;
 	role.access = access;
 	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -2443,7 +2443,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 	else
 		r = paging32_init_context(vcpu);
 
-	vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level;
+	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
 
 	return r;
 }
@@ -2532,7 +2532,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
         }
 
 	++vcpu->kvm->stat.mmu_pte_updated;
-	if (sp->role.glevels == PT32_ROOT_LEVEL)
+	if (!sp->role.cr4_pae)
 		paging32_update_pte(vcpu, sp, spte, new);
 	else
 		paging64_update_pte(vcpu, sp, spte, new);
@@ -2681,7 +2681,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
 		if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
 			continue;
-		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+		pte_size = sp->role.cr4_pae ? 8 : 4;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
 		misaligned |= bytes < 4;
 		if (misaligned || flooded) {
@@ -2705,7 +2705,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		page_offset = offset;
 		level = sp->role.level;
 		npte = 1;
-		if (sp->role.glevels == PT32_ROOT_LEVEL) {
+		if (!sp->role.cr4_pae) {
 			page_offset <<= 1;	/* 32->64 */
 			/*
 			 * A 32-bit pde maps 4MB while the shadow pdes map
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 1fe956ab7617..3851f1f3030c 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -28,9 +28,10 @@
 								        \
 	role.word = __entry->role;					\
 									\
-	trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge"	\
+	trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s %spge"		\
 			 " %snxe root %u %s%c",				\
-			 __entry->gfn, role.level, role.glevels,	\
+			 __entry->gfn, role.level,			\
+			 role.cr4_pae ? " pae" : "",			\
 			 role.quadrant,					\
 			 role.direct ? " direct" : "",			\
 			 access_str[role.access],			\
-- 
cgit v1.2.3-55-g7522


From 19d04437267f00c7b50343513693b7a3174ff908 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 15 Apr 2010 12:29:50 +0300
Subject: KVM: fix emulator_task_switch() return value.

emulator_task_switch() should return -1 for failure and 0 for success to
the caller, just like x86_emulate_insn() does.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 arch/x86/kvm/x86.c     | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 585d0ef4a5f6..5ac0bb465ed6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2453,7 +2453,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 		rc = writeback(ctxt, ops);
 	}
 
-	return rc;
+	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
 static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 58a295c6bf62..30efeead4511 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4800,10 +4800,11 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 				   tss_selector, reason, has_error_code,
 				   error_code);
 
-	if (ret == X86EMUL_CONTINUE)
-		kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	if (ret)
+		return EMULATE_FAIL;
 
-	return (ret != X86EMUL_CONTINUE);
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	return EMULATE_DONE;
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
 
-- 
cgit v1.2.3-55-g7522


From 1b8c7934a4063653095fb9fa88f9169f5b52f52b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Fri, 16 Apr 2010 21:23:41 +0800
Subject: KVM: MMU: remove unused struct kvm_unsync_walk

Remove 'struct kvm_unsync_walk' since it's not used.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 51aa580d0aa5..b57be03d442c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -173,11 +173,6 @@ struct kvm_shadow_walk_iterator {
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_next(&(_walker)))
 
-
-struct kvm_unsync_walk {
-	int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
-};
-
 typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
 
 static struct kmem_cache *pte_chain_cache;
-- 
cgit v1.2.3-55-g7522


From 0571d366e0be571be14581cb5e28d9c3f6e0d0b1 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Fri, 16 Apr 2010 21:27:54 +0800
Subject: KVM: MMU: reduce 'struct kvm_mmu_page' size

Define 'multimapped' as 'bool'.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 707d272ae4a1..3c31c5ad37ab 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -202,9 +202,9 @@ struct kvm_mmu_page {
 	 * in this shadow page.
 	 */
 	DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
-	int multimapped;         /* More than one parent_pte? */
-	int root_count;          /* Currently serving as active root */
+	bool multimapped;         /* More than one parent_pte? */
 	bool unsync;
+	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	union {
 		u64 *parent_pte;               /* !multimapped */
-- 
cgit v1.2.3-55-g7522


From 6b18493d60e7f54a0feb771fa141b5fcb72ce1e4 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Fri, 16 Apr 2010 21:29:17 +0800
Subject: KVM: MMU: remove unused parameter in mmu_parent_walk()

'vcpu' is unused, remove it

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b57be03d442c..45f3aa5213c5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator {
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_next(&(_walker)))
 
-typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
 
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
@@ -1001,8 +1001,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 }
 
 
-static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			    mmu_parent_walk_fn fn)
+static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
 {
 	struct kvm_pte_chain *pte_chain;
 	struct hlist_node *node;
@@ -1011,8 +1010,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 
 	if (!sp->multimapped && sp->parent_pte) {
 		parent_sp = page_header(__pa(sp->parent_pte));
-		fn(vcpu, parent_sp);
-		mmu_parent_walk(vcpu, parent_sp, fn);
+		fn(parent_sp);
+		mmu_parent_walk(parent_sp, fn);
 		return;
 	}
 	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
@@ -1020,8 +1019,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			if (!pte_chain->parent_ptes[i])
 				break;
 			parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
-			fn(vcpu, parent_sp);
-			mmu_parent_walk(vcpu, parent_sp, fn);
+			fn(parent_sp);
+			mmu_parent_walk(parent_sp, fn);
 		}
 }
 
@@ -1058,16 +1057,15 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
 		}
 }
 
-static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int unsync_walk_fn(struct kvm_mmu_page *sp)
 {
 	kvm_mmu_update_parents_unsync(sp);
 	return 1;
 }
 
-static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
-					struct kvm_mmu_page *sp)
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
 {
-	mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+	mmu_parent_walk(sp, unsync_walk_fn);
 	kvm_mmu_update_parents_unsync(sp);
 }
 
@@ -1345,7 +1343,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 			if (sp->unsync_children) {
 				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
-				kvm_mmu_mark_parents_unsync(vcpu, sp);
+				kvm_mmu_mark_parents_unsync(sp);
 			}
 			trace_kvm_mmu_get_page(sp, false);
 			return sp;
@@ -1759,7 +1757,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
 
-	kvm_mmu_mark_parents_unsync(vcpu, sp);
+	kvm_mmu_mark_parents_unsync(sp);
 
 	mmu_convert_notrap(sp);
 	return 0;
-- 
cgit v1.2.3-55-g7522


From acb5451789f21ad51215897bb8f9306a05e8acd4 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 15 Apr 2010 21:03:50 +0300
Subject: KVM: prevent spurious exit to userspace during task switch emulation.

If kvm_task_switch() fails code exits to userspace without specifying
exit reason, so the previous exit reason is reused by userspace. Fix
this by specifying exit reason correctly.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 10 ++++++++--
 arch/x86/kvm/vmx.c |  8 ++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 78af52222fd2..ab78eb8ba899 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2266,8 +2266,14 @@ static int task_switch_interception(struct vcpu_svm *svm)
 	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
 		skip_emulated_instruction(&svm->vcpu);
 
-	return kvm_task_switch(&svm->vcpu, tss_selector, reason,
-			       has_error_code, error_code);
+	if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
+				has_error_code, error_code) == EMULATE_FAIL) {
+		svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		svm->vcpu.run->internal.ndata = 0;
+		return 0;
+	}
+	return 1;
 }
 
 static int cpuid_interception(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1b38d8a88cf7..6e5e75e0d7d3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3316,9 +3316,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 		       type != INTR_TYPE_NMI_INTR))
 		skip_emulated_instruction(vcpu);
 
-	if (!kvm_task_switch(vcpu, tss_selector, reason, has_error_code,
-			     error_code))
+	if (kvm_task_switch(vcpu, tss_selector, reason,
+				has_error_code, error_code) == EMULATE_FAIL) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
 		return 0;
+	}
 
 	/* clear all local breakpoint enable flags */
 	vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
-- 
cgit v1.2.3-55-g7522


From 3246af0ece6c61689847417977733f0b12dc4b6f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Fri, 16 Apr 2010 16:35:54 +0800
Subject: KVM: MMU: cleanup for hlist walk restart

Quote from Avi:

|Just change the assignment to a 'goto restart;' please,
|I don't like playing with list_for_each internals.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 45f3aa5213c5..7a17db1cdcd6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1565,13 +1565,14 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 	r = 0;
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &kvm->arch.mmu_page_hash[index];
+restart:
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
 		if (sp->gfn == gfn && !sp->role.direct) {
 			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
 				 sp->role.word);
 			r = 1;
 			if (kvm_mmu_zap_page(kvm, sp))
-				n = bucket->first;
+				goto restart;
 		}
 	return r;
 }
@@ -1585,13 +1586,14 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &kvm->arch.mmu_page_hash[index];
+restart:
 	hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
 		if (sp->gfn == gfn && !sp->role.direct
 		    && !sp->role.invalid) {
 			pgprintk("%s: zap %lx %x\n",
 				 __func__, gfn, sp->role.word);
 			if (kvm_mmu_zap_page(kvm, sp))
-				nn = bucket->first;
+				goto restart;
 		}
 	}
 }
@@ -2671,6 +2673,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	}
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+
+restart:
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
 		if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
 			continue;
@@ -2691,7 +2695,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
 			if (kvm_mmu_zap_page(vcpu->kvm, sp))
-				n = bucket->first;
+				goto restart;
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
@@ -2900,10 +2904,11 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 	struct kvm_mmu_page *sp, *node;
 
 	spin_lock(&kvm->mmu_lock);
+restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
 		if (kvm_mmu_zap_page(kvm, sp))
-			node = container_of(kvm->arch.active_mmu_pages.next,
-					    struct kvm_mmu_page, link);
+			goto restart;
+
 	spin_unlock(&kvm->mmu_lock);
 
 	kvm_flush_remote_tlbs(kvm);
-- 
cgit v1.2.3-55-g7522


From 90d83dc3d49f5101addae962ccc1b4aff66b68d8 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Mon, 19 Apr 2010 17:41:23 +0800
Subject: KVM: use the correct RCU API for PROVE_RCU=y

The RCU/SRCU API have already changed for proving RCU usage.

I got the following dmesg when PROVE_RCU=y because we used incorrect API.
This patch coverts rcu_deference() to srcu_dereference() or family API.

===================================================
[ INFO: suspicious rcu_dereference_check() usage. ]
---------------------------------------------------
arch/x86/kvm/mmu.c:3020 invoked rcu_dereference_check() without protection!

other info that might help us debug this:

rcu_scheduler_active = 1, debug_locks = 0
2 locks held by qemu-system-x86/8550:
 #0:  (&kvm->slots_lock){+.+.+.}, at: [<ffffffffa011a6ac>] kvm_set_memory_region+0x29/0x50 [kvm]
 #1:  (&(&kvm->mmu_lock)->rlock){+.+...}, at: [<ffffffffa012262d>] kvm_arch_commit_memory_region+0xa6/0xe2 [kvm]

stack backtrace:
Pid: 8550, comm: qemu-system-x86 Not tainted 2.6.34-rc4-tip-01028-g939eab1 #27
Call Trace:
 [<ffffffff8106c59e>] lockdep_rcu_dereference+0xaa/0xb3
 [<ffffffffa012f6c1>] kvm_mmu_calculate_mmu_pages+0x44/0x7d [kvm]
 [<ffffffffa012263e>] kvm_arch_commit_memory_region+0xb7/0xe2 [kvm]
 [<ffffffffa011a5d7>] __kvm_set_memory_region+0x636/0x6e2 [kvm]
 [<ffffffffa011a6ba>] kvm_set_memory_region+0x37/0x50 [kvm]
 [<ffffffffa015e956>] vmx_set_tss_addr+0x46/0x5a [kvm_intel]
 [<ffffffffa0126592>] kvm_arch_vm_ioctl+0x17a/0xcf8 [kvm]
 [<ffffffff810a8692>] ? unlock_page+0x27/0x2c
 [<ffffffff810bf879>] ? __do_fault+0x3a9/0x3e1
 [<ffffffffa011b12f>] kvm_vm_ioctl+0x364/0x38d [kvm]
 [<ffffffff81060cfa>] ? up_read+0x23/0x3d
 [<ffffffff810f3587>] vfs_ioctl+0x32/0xa6
 [<ffffffff810f3b19>] do_vfs_ioctl+0x495/0x4db
 [<ffffffff810e6b2f>] ? fget_light+0xc2/0x241
 [<ffffffff810e416c>] ? do_sys_open+0x104/0x116
 [<ffffffff81382d6d>] ? retint_swapgs+0xe/0x13
 [<ffffffff810f3ba6>] sys_ioctl+0x47/0x6a
 [<ffffffff810021db>] system_call_fastpath+0x16/0x1b

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c |  2 +-
 arch/s390/kvm/kvm-s390.h |  2 +-
 arch/x86/kvm/mmu.c       |  7 ++++---
 arch/x86/kvm/vmx.c       |  2 +-
 arch/x86/kvm/x86.c       |  4 ++--
 arch/x86/kvm/x86.h       |  7 +++++++
 include/linux/kvm_host.h |  7 +++++++
 virt/kvm/iommu.c         |  4 ++--
 virt/kvm/kvm_main.c      | 13 ++++++++-----
 9 files changed, 33 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index d7bac1f75af0..d5f4e9161201 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1381,7 +1381,7 @@ static void kvm_release_vm_pages(struct kvm *kvm)
 	int i, j;
 	unsigned long base_gfn;
 
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
 	for (i = 0; i < slots->nmemslots; i++) {
 		memslot = &slots->memslots[i];
 		base_gfn = memslot->base_gfn;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 60f09ab3672c..cfa9d1777457 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -72,7 +72,7 @@ static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu)
 	struct kvm_memslots *memslots;
 
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
-	memslots = rcu_dereference(vcpu->kvm->memslots);
+	memslots = kvm_memslots(vcpu->kvm);
 
 	mem = &memslots->memslots[0];
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7a17db1cdcd6..0682a393ad90 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -787,7 +787,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 	int retval = 0;
 	struct kvm_memslots *slots;
 
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; i++) {
 		struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -3016,7 +3016,8 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 	unsigned int  nr_pages = 0;
 	struct kvm_memslots *slots;
 
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
+
 	for (i = 0; i < slots->nmemslots; i++)
 		nr_pages += slots->memslots[i].npages;
 
@@ -3292,7 +3293,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
 	int i, j, k, idx;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
 		struct kvm_memory_slot *m = &slots->memslots[i];
 		struct kvm_rmap_desc *d;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0b896ac7e4bb..d0a10b5612e9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1558,7 +1558,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
 		struct kvm_memslots *slots;
 		gfn_t base_gfn;
 
-		slots = rcu_dereference(kvm->memslots);
+		slots = kvm_memslots(kvm);
 		base_gfn = kvm->memslots->memslots[0].base_gfn +
 				 kvm->memslots->memslots[0].npages - 3;
 		return base_gfn << PAGE_SHIFT;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 58a96e6a234c..638248c96999 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2497,7 +2497,7 @@ gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
 	struct kvm_mem_alias *alias;
 	struct kvm_mem_aliases *aliases;
 
-	aliases = rcu_dereference(kvm->arch.aliases);
+	aliases = kvm_aliases(kvm);
 
 	for (i = 0; i < aliases->naliases; ++i) {
 		alias = &aliases->aliases[i];
@@ -2516,7 +2516,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 	struct kvm_mem_alias *alias;
 	struct kvm_mem_aliases *aliases;
 
-	aliases = rcu_dereference(kvm->arch.aliases);
+	aliases = kvm_aliases(kvm);
 
 	for (i = 0; i < aliases->naliases; ++i) {
 		alias = &aliases->aliases[i];
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a404722d2b..f4b54458285b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,6 +65,13 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
 }
 
+static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
+{
+	return rcu_dereference_check(kvm->arch.aliases,
+			srcu_read_lock_held(&kvm->srcu)
+			|| lockdep_is_held(&kvm->slots_lock));
+}
+
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 55830638aeb1..1ed030bad59e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -250,6 +250,13 @@ void kvm_exit(void);
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
 
+static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
+{
+	return rcu_dereference_check(kvm->memslots,
+			srcu_read_lock_held(&kvm->srcu)
+			|| lockdep_is_held(&kvm->slots_lock));
+}
+
 #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
 #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
 static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 80fd3ad3b2de..37ca71ebdba8 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -78,7 +78,7 @@ static int kvm_iommu_map_memslots(struct kvm *kvm)
 	int i, r = 0;
 	struct kvm_memslots *slots;
 
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; i++) {
 		r = kvm_iommu_map_pages(kvm, &slots->memslots[i]);
@@ -217,7 +217,7 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm)
 	int i;
 	struct kvm_memslots *slots;
 
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; i++) {
 		kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d6351a34b297..4901ec5061ba 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -834,7 +834,7 @@ EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
-	struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
+	struct kvm_memslots *slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; ++i) {
 		struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -856,7 +856,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
-	struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
+	struct kvm_memslots *slots = kvm_memslots(kvm);
 
 	gfn = unalias_gfn_instantiation(kvm, gfn);
 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
@@ -900,7 +900,7 @@ out:
 int memslot_id(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
-	struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
+	struct kvm_memslots *slots = kvm_memslots(kvm);
 	struct kvm_memory_slot *memslot = NULL;
 
 	gfn = unalias_gfn(kvm, gfn);
@@ -1994,7 +1994,9 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
 	int i;
-	struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
+	struct kvm_io_bus *bus;
+
+	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
 	for (i = 0; i < bus->dev_count; i++)
 		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
 			return 0;
@@ -2006,8 +2008,9 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		    int len, void *val)
 {
 	int i;
-	struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
+	struct kvm_io_bus *bus;
 
+	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
 	for (i = 0; i < bus->dev_count; i++)
 		if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
 			return 0;
-- 
cgit v1.2.3-55-g7522


From 87bc3bf972af0585ba5415aebbc8bd09b6a2ee94 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Mon, 19 Apr 2010 17:25:53 +0300
Subject: KVM: MMU: Drop cr4.pge from shadow page role

Since commit bf47a760f66ad, we no longer handle ptes with the global bit
set specially, so there is no reason to distinguish between shadow pages
created with cr4.gpe set and clear.

Such tracking is expensive when the guest toggles cr4.pge, so drop it.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 arch/x86/kvm/mmutrace.h         | 3 +--
 arch/x86/kvm/x86.c              | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3c31c5ad37ab..d47d087568fe 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -178,7 +178,6 @@ union kvm_mmu_page_role {
 		unsigned direct:1;
 		unsigned access:3;
 		unsigned invalid:1;
-		unsigned cr4_pge:1;
 		unsigned nxe:1;
 	};
 };
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3851f1f3030c..bc4f7f0be2b1 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -28,7 +28,7 @@
 								        \
 	role.word = __entry->role;					\
 									\
-	trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s %spge"		\
+	trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s"		\
 			 " %snxe root %u %s%c",				\
 			 __entry->gfn, role.level,			\
 			 role.cr4_pae ? " pae" : "",			\
@@ -36,7 +36,6 @@
 			 role.direct ? " direct" : "",			\
 			 access_str[role.access],			\
 			 role.invalid ? " invalid" : "",		\
-			 role.cr4_pge ? "" : "!",			\
 			 role.nxe ? "" : "!",				\
 			 __entry->root_count,				\
 			 __entry->unsync ? "unsync" : "sync", 0);	\
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 638248c96999..cf37ac6644e0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -488,7 +488,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	}
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 	vcpu->arch.cr4 = cr4;
-	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
 	kvm_mmu_reset_context(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
-- 
cgit v1.2.3-55-g7522


From 51fb60d81b483ae75614d401e7d4271884894113 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng
Date: Fri, 16 Apr 2010 17:16:40 +0800
Subject: KVM: MMU: Move sync_page() first pte address calculation out of loop

Move first pte address calculation out of loop to save some cycles.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d9dea288e51d..5910557b3f33 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -572,12 +572,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	int i, offset, nr_present;
 	bool reset_host_protection;
+	gpa_t first_pte_gpa;
 
 	offset = nr_present = 0;
 
 	if (PTTYPE == 32)
 		offset = sp->role.quadrant << PT64_LEVEL_BITS;
 
+	first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+
 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
 		unsigned pte_access;
 		pt_element_t gpte;
@@ -587,8 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		if (!is_shadow_present_pte(sp->spt[i]))
 			continue;
 
-		pte_gpa = gfn_to_gpa(sp->gfn);
-		pte_gpa += (i+offset) * sizeof(pt_element_t);
+		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
 
 		if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
 					  sizeof(pt_element_t)))
-- 
cgit v1.2.3-55-g7522


From 814a59d2077d630cffca7e2878c5b6f9b91ba725 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng
Date: Fri, 16 Apr 2010 17:18:01 +0800
Subject: KVM: MMU: Make use of is_large_pte() in walker

Make use of is_large_pte() instead of checking PT_PAGE_SIZE_MASK
bit directly.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 5910557b3f33..d0cc07eb6eda 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -190,10 +190,10 @@ walk:
 
 		if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
 		    ((walker->level == PT_DIRECTORY_LEVEL) &&
-				(pte & PT_PAGE_SIZE_MASK)  &&
+				is_large_pte(pte) &&
 				(PTTYPE == 64 || is_pse(vcpu))) ||
 		    ((walker->level == PT_PDPE_LEVEL) &&
-				(pte & PT_PAGE_SIZE_MASK)  &&
+				is_large_pte(pte) &&
 				is_long_mode(vcpu))) {
 			int lvl = walker->level;
 
-- 
cgit v1.2.3-55-g7522


From b2fc15a5ef6679a5f87fee012a836294532bc628 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng
Date: Fri, 16 Apr 2010 17:18:54 +0800
Subject: KVM: MMU: Remove unused varialbe in rmap_next()

Remove unused varialbe in rmap_next()

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0682a393ad90..7eff794457f3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -647,7 +647,6 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 {
 	struct kvm_rmap_desc *desc;
-	struct kvm_rmap_desc *prev_desc;
 	u64 *prev_spte;
 	int i;
 
@@ -659,7 +658,6 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 		return NULL;
 	}
 	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-	prev_desc = NULL;
 	prev_spte = NULL;
 	while (desc) {
 		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
-- 
cgit v1.2.3-55-g7522


From 2a059bf444dd7758ccf48f217cd981570132be85 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng
Date: Fri, 16 Apr 2010 17:19:48 +0800
Subject: KVM: Get rid of dead function gva_to_page()

Nobody use gva_to_page() anymore, get rid of it.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       | 14 --------------
 include/linux/kvm_host.h |  1 -
 2 files changed, 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7eff794457f3..d2a110e870fa 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1618,20 +1618,6 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
 	}
 }
 
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
-{
-	struct page *page;
-
-	gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-
-	if (gpa == UNMAPPED_GVA)
-		return NULL;
-
-	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-
-	return page;
-}
-
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1ed030bad59e..ce027d518096 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -260,7 +260,6 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
 #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
 #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
 static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
 
 extern struct page *bad_page;
 extern pfn_t bad_pfn;
-- 
cgit v1.2.3-55-g7522


From 77a1a715707d0f60ce0cfbe44070527a0a561f01 Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Fri, 16 Apr 2010 16:21:42 +0800
Subject: KVM: MMU: cleanup for function unaccount_shadowed()

Since gfn is not changed in the for loop, we do not need to call
gfn_to_memslot_unaliased() under the loop, and it is safe to move
it out.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d2a110e870fa..ddfa8658fb6d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -431,9 +431,9 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 	int i;
 
 	gfn = unalias_gfn(kvm, gfn);
+	slot = gfn_to_memslot_unaliased(kvm, gfn);
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		slot          = gfn_to_memslot_unaliased(kvm, gfn);
 		write_count   = slot_largepage_idx(gfn, slot, i);
 		*write_count -= 1;
 		WARN_ON(*write_count < 0);
-- 
cgit v1.2.3-55-g7522


From cdbecfc398a904ce9f5c126638b09a2429fb86ed Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Sat, 17 Apr 2010 16:41:47 +0800
Subject: KVM: VMX: free vpid when fail to create vcpu

Fix bug of the exception path, free allocated vpid when fail
to create vcpu.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d0a10b5612e9..54c0035a63f0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2333,6 +2333,16 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
 	spin_unlock(&vmx_vpid_lock);
 }
 
+static void free_vpid(struct vcpu_vmx *vmx)
+{
+	if (!enable_vpid)
+		return;
+	spin_lock(&vmx_vpid_lock);
+	if (vmx->vpid != 0)
+		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
+	spin_unlock(&vmx_vpid_lock);
+}
+
 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
 {
 	int f = sizeof(unsigned long);
@@ -3916,10 +3926,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	spin_lock(&vmx_vpid_lock);
-	if (vmx->vpid != 0)
-		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
-	spin_unlock(&vmx_vpid_lock);
+	free_vpid(vmx);
 	vmx_free_vmcs(vcpu);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
@@ -3981,6 +3988,7 @@ free_msrs:
 uninit_vcpu:
 	kvm_vcpu_uninit(&vmx->vcpu);
 free_vcpu:
+	free_vpid(vmx);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 	return ERR_PTR(err);
 }
-- 
cgit v1.2.3-55-g7522


From 924584ccb08c338ebd2f40936ff2321c1cce6a6d Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 22 Apr 2010 12:33:07 +0200
Subject: KVM: SVM: Fix nested nmi handling

The patch introducing nested nmi handling had a bug. The
check does not belong to enable_nmi_window but must be in
nmi_allowed. This patch fixes this.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ab78eb8ba899..ec205847be6a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2771,8 +2771,12 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
-	return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-		!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+	int ret;
+	ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+	      !(svm->vcpu.arch.hflags & HF_NMI_MASK);
+	ret = ret && gif_set(svm) && nested_svm_nmi(svm);
+
+	return ret;
 }
 
 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2841,11 +2845,9 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	 * Something prevents NMI from been injected. Single step over possible
 	 * problem (IRET or exception injection or interrupt shadow)
 	 */
-	if (gif_set(svm) && nested_svm_nmi(svm)) {
-		svm->nmi_singlestep = true;
-		svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
-		update_db_intercept(vcpu);
-	}
+	svm->nmi_singlestep = true;
+	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+	update_db_intercept(vcpu);
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-- 
cgit v1.2.3-55-g7522


From 2041a06a50a2ef4062c8454482aa06e25f6cccde Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 22 Apr 2010 12:33:08 +0200
Subject: KVM: SVM: Make sure rip is synced to vmcb before nested vmexit

This patch fixes a bug where a nested guest always went over
the same instruction because the rip was not advanced on a
nested vmexit.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ec205847be6a..c480d7f64a60 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2960,6 +2960,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	u16 gs_selector;
 	u16 ldt_selector;
 
+	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
 	/*
 	 * A vmexit emulation is required before the vcpu can be executed
 	 * again.
@@ -2967,10 +2971,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(svm->nested.exit_required))
 		return;
 
-	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
-	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
-	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
-
 	pre_svm_run(svm);
 
 	sync_lapic_to_cr8(vcpu);
-- 
cgit v1.2.3-55-g7522


From 2be4fc7a02c368dcfda83a386a5101c211b9535e Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 22 Apr 2010 12:33:09 +0200
Subject: KVM: SVM: Sync cr0 and cr3 to kvm state before nested handling

This patch syncs cr0 and cr3 from the vmcb to the kvm state
before nested intercept handling is done. This allows to
simplify the vmexit path.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c480d7f64a60..5ad9d802bd16 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1799,10 +1799,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->save.gdtr   = vmcb->save.gdtr;
 	nested_vmcb->save.idtr   = vmcb->save.idtr;
 	nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
-	if (npt_enabled)
-		nested_vmcb->save.cr3    = vmcb->save.cr3;
-	else
-		nested_vmcb->save.cr3    = svm->vcpu.arch.cr3;
+	nested_vmcb->save.cr3    = svm->vcpu.arch.cr3;
 	nested_vmcb->save.cr2    = vmcb->save.cr2;
 	nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
 	nested_vmcb->save.rflags = vmcb->save.rflags;
@@ -2641,6 +2638,11 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 
 	trace_kvm_exit(exit_code, vcpu);
 
+	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
+		vcpu->arch.cr0 = svm->vmcb->save.cr0;
+	if (npt_enabled)
+		vcpu->arch.cr3 = svm->vmcb->save.cr3;
+
 	if (unlikely(svm->nested.exit_required)) {
 		nested_svm_vmexit(svm);
 		svm->nested.exit_required = false;
@@ -2668,11 +2670,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 
 	svm_complete_interrupts(svm);
 
-	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
-		vcpu->arch.cr0 = svm->vmcb->save.cr0;
-	if (npt_enabled)
-		vcpu->arch.cr3 = svm->vmcb->save.cr3;
-
 	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		kvm_run->fail_entry.hardware_entry_failure_reason
-- 
cgit v1.2.3-55-g7522


From 228070b1b31abc16c331b57bf965101c6eb1d167 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 22 Apr 2010 12:33:10 +0200
Subject: KVM: SVM: Propagate nested entry failure into guest hypervisor

This patch implements propagation of a failes guest vmrun
back into the guest instead of killing the whole guest.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5ad9d802bd16..b10d1630c203 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1715,6 +1715,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 			vmexit = NESTED_EXIT_DONE;
 		break;
 	}
+	case SVM_EXIT_ERR: {
+		vmexit = NESTED_EXIT_DONE;
+		break;
+	}
 	default: {
 		u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
 		if (svm->nested.intercept & exit_bits)
-- 
cgit v1.2.3-55-g7522


From d4330ef2fb2236a1e3a176f0f68360f4c0a8661b Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 22 Apr 2010 12:33:11 +0200
Subject: KVM: x86: Add callback to let modules decide over some supported
 cpuid bits

This patch adds the get_supported_cpuid callback to
kvm_x86_ops. It will be used in do_cpuid_ent to delegate the
decission about some supported cpuid bits to the
architecture modules.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 arch/x86/kvm/svm.c              | 6 ++++++
 arch/x86/kvm/vmx.c              | 6 ++++++
 arch/x86/kvm/x86.c              | 3 +++
 4 files changed, 17 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d47d087568fe..357573af974f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -528,6 +528,8 @@ struct kvm_x86_ops {
 	int (*get_lpage_level)(void);
 	bool (*rdtscp_supported)(void);
 
+	void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
+
 	const struct trace_print_flags *exit_reasons_str;
 };
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b10d1630c203..0fa2035bd8e7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3152,6 +3152,10 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 {
 }
 
+static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+}
+
 static const struct trace_print_flags svm_exit_reasons_str[] = {
 	{ SVM_EXIT_READ_CR0,			"read_cr0" },
 	{ SVM_EXIT_READ_CR3,			"read_cr3" },
@@ -3297,6 +3301,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.cpuid_update = svm_cpuid_update,
 
 	.rdtscp_supported = svm_rdtscp_supported,
+
+	.set_supported_cpuid = svm_set_supported_cpuid,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 54c0035a63f0..9f8532b1fa9a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4119,6 +4119,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -4191,6 +4195,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.cpuid_update = vmx_cpuid_update,
 
 	.rdtscp_supported = vmx_rdtscp_supported,
+
+	.set_supported_cpuid = vmx_set_supported_cpuid,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 848c814e8c3c..6e6434332f21 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1960,6 +1960,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->ecx &= kvm_supported_word6_x86_features;
 		break;
 	}
+
+	kvm_x86_ops->set_supported_cpuid(function, entry);
+
 	put_cpu();
 }
 
-- 
cgit v1.2.3-55-g7522


From c2c63a493924e09a1984d1374a0e60dfd54fc0b0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 22 Apr 2010 12:33:12 +0200
Subject: KVM: SVM: Report emulated SVM features to userspace

This patch implements the reporting of the emulated SVM
features to userspace instead of the real hardware
capabilities. Every real hardware capability needs emulation
in nested svm so the old behavior was broken.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0fa2035bd8e7..65fc11438b75 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3154,6 +3154,16 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 
 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 {
+	switch (func) {
+	case 0x8000000A:
+		entry->eax = 1; /* SVM revision 1 */
+		entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
+				   ASID emulation to nested SVM */
+		entry->ecx = 0; /* Reserved */
+		entry->edx = 0; /* Do not support any additional features */
+
+		break;
+	}
 }
 
 static const struct trace_print_flags svm_exit_reasons_str[] = {
-- 
cgit v1.2.3-55-g7522


From ce7ddec4bbbc08f0c2901cc103773aed864b09fd Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 22 Apr 2010 12:33:13 +0200
Subject: KVM: x86: Allow marking an exception as reinjected

This patch adds logic to kvm/x86 which allows to mark an
injected exception as reinjected. This allows to remove an
ugly hack from svm_complete_interrupts that prevented
exceptions from being reinjected at all in the nested case.
The hack was necessary because an reinjected exception into
the nested guest could cause a nested vmexit emulation. But
reinjected exceptions must not intercept. The downside of
the hack is that a exception that in injected could get
lost.
This patch fixes the problem and puts the code for it into
generic x86 files because. Nested-VMX will likely have the
same problem and could reuse the code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 +++++-
 arch/x86/kvm/svm.c              | 12 ++++++------
 arch/x86/kvm/vmx.c              |  3 ++-
 arch/x86/kvm/x86.c              | 23 +++++++++++++++++++----
 4 files changed, 32 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 357573af974f..3f0007b076da 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -312,6 +312,7 @@ struct kvm_vcpu_arch {
 	struct kvm_queued_exception {
 		bool pending;
 		bool has_error_code;
+		bool reinject;
 		u8 nr;
 		u32 error_code;
 	} exception;
@@ -514,7 +515,8 @@ struct kvm_x86_ops {
 	void (*set_irq)(struct kvm_vcpu *vcpu);
 	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
-				bool has_error_code, u32 error_code);
+				bool has_error_code, u32 error_code,
+				bool reinject);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
 	int (*nmi_allowed)(struct kvm_vcpu *vcpu);
 	bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -617,6 +619,8 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
 			   u32 error_code);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 65fc11438b75..30e49fe7f8c0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -338,7 +338,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 }
 
 static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
-				bool has_error_code, u32 error_code)
+				bool has_error_code, u32 error_code,
+				bool reinject)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -346,7 +347,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	 * If we are within a nested VM we'd better #VMEXIT and let the guest
 	 * handle the exception
 	 */
-	if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
+	if (!reinject &&
+	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
 		return;
 
 	if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
@@ -2918,8 +2920,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 		svm->vcpu.arch.nmi_injected = true;
 		break;
 	case SVM_EXITINTINFO_TYPE_EXEPT:
-		if (is_nested(svm))
-			break;
 		/*
 		 * In case of software exceptions, do not reinject the vector,
 		 * but re-execute the instruction instead. Rewind RIP first
@@ -2935,10 +2935,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 		}
 		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
 			u32 err = svm->vmcb->control.exit_int_info_err;
-			kvm_queue_exception_e(&svm->vcpu, vector, err);
+			kvm_requeue_exception_e(&svm->vcpu, vector, err);
 
 		} else
-			kvm_queue_exception(&svm->vcpu, vector);
+			kvm_requeue_exception(&svm->vcpu, vector);
 		break;
 	case SVM_EXITINTINFO_TYPE_INTR:
 		kvm_queue_interrupt(&svm->vcpu, vector, false);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9f8532b1fa9a..875b785228f6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -919,7 +919,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 }
 
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
-				bool has_error_code, u32 error_code)
+				bool has_error_code, u32 error_code,
+				bool reinject)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6e6434332f21..6b2ce1d2d748 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -265,7 +265,8 @@ static int exception_class(int vector)
 }
 
 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
-		unsigned nr, bool has_error, u32 error_code)
+		unsigned nr, bool has_error, u32 error_code,
+		bool reinject)
 {
 	u32 prev_nr;
 	int class1, class2;
@@ -276,6 +277,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 		vcpu->arch.exception.has_error_code = has_error;
 		vcpu->arch.exception.nr = nr;
 		vcpu->arch.exception.error_code = error_code;
+		vcpu->arch.exception.reinject = true;
 		return;
 	}
 
@@ -304,10 +306,16 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
-	kvm_multiple_exception(vcpu, nr, false, 0);
+	kvm_multiple_exception(vcpu, nr, false, 0, false);
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+	kvm_multiple_exception(vcpu, nr, false, 0, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception);
+
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 			   u32 error_code)
 {
@@ -324,10 +332,16 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 {
-	kvm_multiple_exception(vcpu, nr, true, error_code);
+	kvm_multiple_exception(vcpu, nr, true, error_code, false);
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+	kvm_multiple_exception(vcpu, nr, true, error_code, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
+
 /*
  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
  * a #GP and return false.
@@ -4408,7 +4422,8 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 					vcpu->arch.exception.error_code);
 		kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
 					  vcpu->arch.exception.has_error_code,
-					  vcpu->arch.exception.error_code);
+					  vcpu->arch.exception.error_code,
+					  vcpu->arch.exception.reinject);
 		return;
 	}
 
-- 
cgit v1.2.3-55-g7522


From ff47a49b235e59e606e1cb267a08f7fbeb5719d1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 22 Apr 2010 12:33:14 +0200
Subject: KVM: SVM: Handle MCE intercepts always on host level

This patch prevents MCE intercepts from being propagated
into the L1 guest if they happened in an L2 guest.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 30e49fe7f8c0..889f66022e57 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1651,6 +1651,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 	switch (exit_code) {
 	case SVM_EXIT_INTR:
 	case SVM_EXIT_NMI:
+	case SVM_EXIT_EXCP_BASE + MC_VECTOR:
 		return NESTED_EXIT_HOST;
 	case SVM_EXIT_NPF:
 		/* For now we are always handling NPFs when using them */
-- 
cgit v1.2.3-55-g7522


From df2fb6e7106dd0b76e3576bfaecbeb6f34843709 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng
Date: Thu, 22 Apr 2010 17:33:57 +0800
Subject: KVM: MMU: fix sp->unsync type error in trace event definition

sp->unsync is bool now, so update trace event declaration.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmutrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index bc4f7f0be2b1..40a1786f36c7 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -11,7 +11,7 @@
 	__field(__u64, gfn) \
 	__field(__u32, role) \
 	__field(__u32, root_count) \
-	__field(__u32, unsync)
+	__field(bool, unsync)
 
 #define KVM_MMU_PAGE_ASSIGN(sp)			     \
 	__entry->gfn = sp->gfn;			     \
-- 
cgit v1.2.3-55-g7522


From f3d46f9d3194e0329216002a8724d4c0957abc79 Mon Sep 17 00:00:00 2001
From: Anton Blanchard
Date: Mon, 17 May 2010 14:33:53 +1000
Subject: atomic_t: Cast to volatile when accessing atomic variables

In preparation for removing volatile from the atomic_t definition, this
patch adds a volatile cast to all the atomic read functions.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/atomic.h    | 4 ++--
 arch/arm/include/asm/atomic.h      | 2 +-
 arch/avr32/include/asm/atomic.h    | 2 +-
 arch/cris/include/asm/atomic.h     | 2 +-
 arch/frv/include/asm/atomic.h      | 2 +-
 arch/h8300/include/asm/atomic.h    | 2 +-
 arch/ia64/include/asm/atomic.h     | 4 ++--
 arch/m32r/include/asm/atomic.h     | 2 +-
 arch/m68k/include/asm/atomic_mm.h  | 2 +-
 arch/m68k/include/asm/atomic_no.h  | 2 +-
 arch/mips/include/asm/atomic.h     | 4 ++--
 arch/mn10300/include/asm/atomic.h  | 2 +-
 arch/parisc/include/asm/atomic.h   | 4 ++--
 arch/sh/include/asm/atomic.h       | 2 +-
 arch/sparc/include/asm/atomic_32.h | 2 +-
 arch/sparc/include/asm/atomic_64.h | 4 ++--
 arch/x86/include/asm/atomic.h      | 2 +-
 arch/x86/include/asm/atomic64_64.h | 2 +-
 arch/xtensa/include/asm/atomic.h   | 2 +-
 include/asm-generic/atomic.h       | 2 +-
 20 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index 610dff44d94b..e756d04b6cd5 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -17,8 +17,8 @@
 #define ATOMIC_INIT(i)		( (atomic_t) { (i) } )
 #define ATOMIC64_INIT(i)	( (atomic64_t) { (i) } )
 
-#define atomic_read(v)		((v)->counter + 0)
-#define atomic64_read(v)	((v)->counter + 0)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
+#define atomic64_read(v)	(*(volatile long *)&(v)->counter)
 
 #define atomic_set(v,i)		((v)->counter = (i))
 #define atomic64_set(v,i)	((v)->counter = (i))
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index e8ddec2cb158..a0162fa94564 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -24,7 +24,7 @@
  * strex/ldrex monitor on some implementations. The reason we can use it for
  * atomic_set() is the clrex or dummy strex done on every exception return.
  */
-#define atomic_read(v)	((v)->counter)
+#define atomic_read(v)	(*(volatile int *)&(v)->counter)
 #define atomic_set(v,i)	(((v)->counter) = (i))
 
 #if __LINUX_ARM_ARCH__ >= 6
diff --git a/arch/avr32/include/asm/atomic.h b/arch/avr32/include/asm/atomic.h
index b131c27ddf57..bbce6a1c6bb6 100644
--- a/arch/avr32/include/asm/atomic.h
+++ b/arch/avr32/include/asm/atomic.h
@@ -19,7 +19,7 @@
 
 #define ATOMIC_INIT(i)  { (i) }
 
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 #define atomic_set(v, i)	(((v)->counter) = i)
 
 /*
diff --git a/arch/cris/include/asm/atomic.h b/arch/cris/include/asm/atomic.h
index a6aca819e9f3..88dc9b9c4ba0 100644
--- a/arch/cris/include/asm/atomic.h
+++ b/arch/cris/include/asm/atomic.h
@@ -15,7 +15,7 @@
 
 #define ATOMIC_INIT(i)  { (i) }
 
-#define atomic_read(v) ((v)->counter)
+#define atomic_read(v) (*(volatile int *)&(v)->counter)
 #define atomic_set(v,i) (((v)->counter) = (i))
 
 /* These should be written in asm but we do it in C for now. */
diff --git a/arch/frv/include/asm/atomic.h b/arch/frv/include/asm/atomic.h
index 00a57af79afc..fae32c7fdcb6 100644
--- a/arch/frv/include/asm/atomic.h
+++ b/arch/frv/include/asm/atomic.h
@@ -36,7 +36,7 @@
 #define smp_mb__after_atomic_inc()	barrier()
 
 #define ATOMIC_INIT(i)		{ (i) }
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 #define atomic_set(v, i)	(((v)->counter) = (i))
 
 #ifndef CONFIG_FRV_OUTOFLINE_ATOMIC_OPS
diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h
index 33c8c0fa9583..e936804b7508 100644
--- a/arch/h8300/include/asm/atomic.h
+++ b/arch/h8300/include/asm/atomic.h
@@ -10,7 +10,7 @@
 
 #define ATOMIC_INIT(i)	{ (i) }
 
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 #define atomic_set(v, i)	(((v)->counter) = i)
 
 #include <asm/system.h>
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index 88405cb0832a..4e1948447a00 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -21,8 +21,8 @@
 #define ATOMIC_INIT(i)		((atomic_t) { (i) })
 #define ATOMIC64_INIT(i)	((atomic64_t) { (i) })
 
-#define atomic_read(v)		((v)->counter)
-#define atomic64_read(v)	((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
+#define atomic64_read(v)	(*(volatile long *)&(v)->counter)
 
 #define atomic_set(v,i)		(((v)->counter) = (i))
 #define atomic64_set(v,i)	(((v)->counter) = (i))
diff --git a/arch/m32r/include/asm/atomic.h b/arch/m32r/include/asm/atomic.h
index 63f0cf0f50dd..d44a51e5271b 100644
--- a/arch/m32r/include/asm/atomic.h
+++ b/arch/m32r/include/asm/atomic.h
@@ -26,7 +26,7 @@
  *
  * Atomically reads the value of @v.
  */
-#define atomic_read(v)	((v)->counter)
+#define atomic_read(v)	(*(volatile int *)&(v)->counter)
 
 /**
  * atomic_set - set atomic variable
diff --git a/arch/m68k/include/asm/atomic_mm.h b/arch/m68k/include/asm/atomic_mm.h
index d9d2ed647435..6a223b3f7e74 100644
--- a/arch/m68k/include/asm/atomic_mm.h
+++ b/arch/m68k/include/asm/atomic_mm.h
@@ -15,7 +15,7 @@
 
 #define ATOMIC_INIT(i)	{ (i) }
 
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 #define atomic_set(v, i)	(((v)->counter) = i)
 
 static inline void atomic_add(int i, atomic_t *v)
diff --git a/arch/m68k/include/asm/atomic_no.h b/arch/m68k/include/asm/atomic_no.h
index 5674cb9449bd..289310c63a8a 100644
--- a/arch/m68k/include/asm/atomic_no.h
+++ b/arch/m68k/include/asm/atomic_no.h
@@ -15,7 +15,7 @@
 
 #define ATOMIC_INIT(i)	{ (i) }
 
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 #define atomic_set(v, i)	(((v)->counter) = i)
 
 static __inline__ void atomic_add(int i, atomic_t *v)
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index 519197ede089..59dc0c7ef733 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -29,7 +29,7 @@
  *
  * Atomically reads the value of @v.
  */
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 
 /*
  * atomic_set - set atomic variable
@@ -410,7 +410,7 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
  * @v: pointer of type atomic64_t
  *
  */
-#define atomic64_read(v)	((v)->counter)
+#define atomic64_read(v)	(*(volatile long *)&(v)->counter)
 
 /*
  * atomic64_set - set atomic variable
diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h
index 5bf5be9566de..e41222d6c2fd 100644
--- a/arch/mn10300/include/asm/atomic.h
+++ b/arch/mn10300/include/asm/atomic.h
@@ -31,7 +31,7 @@
  * Atomically reads the value of @v.  Note that the guaranteed
  * useful range of an atomic_t is only 24 bits.
  */
-#define atomic_read(v)	((v)->counter)
+#define atomic_read(v)	(*(volatile int *)&(v)->counter)
 
 /**
  * atomic_set - set atomic variable
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 716634d1f546..f81955934aeb 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -189,7 +189,7 @@ static __inline__ void atomic_set(atomic_t *v, int i)
 
 static __inline__ int atomic_read(const atomic_t *v)
 {
-	return v->counter;
+	return (*(volatile int *)&(v)->counter);
 }
 
 /* exported interface */
@@ -286,7 +286,7 @@ atomic64_set(atomic64_t *v, s64 i)
 static __inline__ s64
 atomic64_read(const atomic64_t *v)
 {
-	return v->counter;
+	return (*(volatile long *)&(v)->counter);
 }
 
 #define atomic64_add(i,v)	((void)(__atomic64_add_return( ((s64)(i)),(v))))
diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h
index 275a448ae8c2..c7983124d99d 100644
--- a/arch/sh/include/asm/atomic.h
+++ b/arch/sh/include/asm/atomic.h
@@ -13,7 +13,7 @@
 
 #define ATOMIC_INIT(i)	( (atomic_t) { (i) } )
 
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 #define atomic_set(v,i)		((v)->counter = (i))
 
 #if defined(CONFIG_GUSA_RB)
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index f0d343c3b956..7ae128b19d3f 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -25,7 +25,7 @@ extern int atomic_cmpxchg(atomic_t *, int, int);
 extern int atomic_add_unless(atomic_t *, int, int);
 extern void atomic_set(atomic_t *, int);
 
-#define atomic_read(v)          ((v)->counter)
+#define atomic_read(v)          (*(volatile int *)&(v)->counter)
 
 #define atomic_add(i, v)	((void)__atomic_add_return( (int)(i), (v)))
 #define atomic_sub(i, v)	((void)__atomic_add_return(-(int)(i), (v)))
diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index f2e48009989e..2050ca02c423 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -13,8 +13,8 @@
 #define ATOMIC_INIT(i)		{ (i) }
 #define ATOMIC64_INIT(i)	{ (i) }
 
-#define atomic_read(v)		((v)->counter)
-#define atomic64_read(v)	((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
+#define atomic64_read(v)	(*(volatile long *)&(v)->counter)
 
 #define atomic_set(v, i)	(((v)->counter) = i)
 #define atomic64_set(v, i)	(((v)->counter) = i)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 8f8217b9bdac..37b39d27abe0 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -22,7 +22,7 @@
  */
 static inline int atomic_read(const atomic_t *v)
 {
-	return v->counter;
+	return (*(volatile int *)&(v)->counter);
 }
 
 /**
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 51c5b4056929..b014e235ea8d 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -18,7 +18,7 @@
  */
 static inline long atomic64_read(const atomic64_t *v)
 {
-	return v->counter;
+	return (*(volatile long *)&(v)->counter);
 }
 
 /**
diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h
index 22d6dde42619..a96a0619d0b7 100644
--- a/arch/xtensa/include/asm/atomic.h
+++ b/arch/xtensa/include/asm/atomic.h
@@ -46,7 +46,7 @@
  *
  * Atomically reads the value of @v.
  */
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 
 /**
  * atomic_set - set atomic variable
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index c99c64dc5f3d..c33749f95b32 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -33,7 +33,7 @@
  * Atomically reads the value of @v.  Note that the guaranteed
  * useful range of an atomic_t is only 24 bits.
  */
-#define atomic_read(v)	((v)->counter)
+#define atomic_read(v)	(*(volatile int *)&(v)->counter)
 
 /**
  * atomic_set - set atomic variable
-- 
cgit v1.2.3-55-g7522


From 9a58a3333923c7fef4ba6ac9afd817429e63a1fe Mon Sep 17 00:00:00 2001
From: Sreedhara DS
Date: Mon, 26 Apr 2010 18:13:05 +0100
Subject: IPC driver for Intel Mobile Internet Device (MID) platforms

The IPC (inter processor communications) is used to provide the
communications between kernel and system control units on some embedded
Intel x86 platforms.

(Various bits of clean up and restructuring by Alan Cox)

Signed-off-by: Sreedhara DS <sreedhara.ds@intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
---
 arch/x86/include/asm/intel_scu_ipc.h |  55 +++
 drivers/platform/x86/Kconfig         |   8 +
 drivers/platform/x86/Makefile        |   1 +
 drivers/platform/x86/intel_scu_ipc.c | 829 +++++++++++++++++++++++++++++++++++
 4 files changed, 893 insertions(+)
 create mode 100644 arch/x86/include/asm/intel_scu_ipc.h
 create mode 100644 drivers/platform/x86/intel_scu_ipc.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
new file mode 100644
index 000000000000..4470c9ad4a3e
--- /dev/null
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -0,0 +1,55 @@
+#ifndef _ASM_X86_INTEL_SCU_IPC_H_
+#define  _ASM_X86_INTEL_SCU_IPC_H_
+
+/* Read single register */
+int intel_scu_ipc_ioread8(u16 addr, u8 *data);
+
+/* Read two sequential registers */
+int intel_scu_ipc_ioread16(u16 addr, u16 *data);
+
+/* Read four sequential registers */
+int intel_scu_ipc_ioread32(u16 addr, u32 *data);
+
+/* Read a vector */
+int intel_scu_ipc_readv(u16 *addr, u8 *data, int len);
+
+/* Write single register */
+int intel_scu_ipc_iowrite8(u16 addr, u8 data);
+
+/* Write two sequential registers */
+int intel_scu_ipc_iowrite16(u16 addr, u16 data);
+
+/* Write four sequential registers */
+int intel_scu_ipc_iowrite32(u16 addr, u32 data);
+
+/* Write a vector */
+int intel_scu_ipc_writev(u16 *addr, u8 *data, int len);
+
+/* Update single register based on the mask */
+int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask);
+
+/*
+ * Indirect register read
+ * Can be used when SCCB(System Controller Configuration Block) register
+ * HRIM(Honor Restricted IPC Messages) is set (bit 23)
+ */
+int intel_scu_ipc_register_read(u32 addr, u32 *data);
+
+/*
+ * Indirect register write
+ * Can be used when SCCB(System Controller Configuration Block) register
+ * HRIM(Honor Restricted IPC Messages) is set (bit 23)
+ */
+int intel_scu_ipc_register_write(u32 addr, u32 data);
+
+/* Issue commands to the SCU with or without data */
+int intel_scu_ipc_simple_command(int cmd, int sub);
+int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
+							u32 *out, int outlen);
+/* I2C control api */
+int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data);
+
+/* Update FW version */
+int intel_scu_ipc_fw_update(u8 *buffer, u32 length);
+
+#endif
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 6c3320d75055..619134151ca6 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -527,4 +527,12 @@ config ACPI_CMPC
 	  keys as input device, backlight device, tablet and accelerometer
 	  devices.
 
+config INTEL_SCU_IPC
+	bool "Intel SCU IPC Support"
+	depends on X86_MRST
+	default y
+	---help---
+	  IPC is used to bridge the communications between kernel and SCU on
+	  some embedded Intel x86 platforms.
+
 endif # X86_PLATFORM_DEVICES
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index a906490e3530..8770bfe71431 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_ACPI_ASUS)		+= asus_acpi.o
 obj-$(CONFIG_TOPSTAR_LAPTOP)	+= topstar-laptop.o
 obj-$(CONFIG_ACPI_TOSHIBA)	+= toshiba_acpi.o
 obj-$(CONFIG_TOSHIBA_BT_RFKILL)	+= toshiba_bluetooth.o
+obj-$(CONFIG_INTEL_SCU_IPC)	+= intel_scu_ipc.o
diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c
new file mode 100644
index 000000000000..576c3ed92435
--- /dev/null
+++ b/drivers/platform/x86/intel_scu_ipc.c
@@ -0,0 +1,829 @@
+/*
+ * intel_scu_ipc.c: Driver for the Intel SCU IPC mechanism
+ *
+ * (C) Copyright 2008-2010 Intel Corporation
+ * Author: Sreedhara DS (sreedhara.ds@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * SCU runing in ARC processor communicates with other entity running in IA
+ * core through IPC mechanism which in turn messaging between IA core ad SCU.
+ * SCU has two IPC mechanism IPC-1 and IPC-2. IPC-1 is used between IA32 and
+ * SCU where IPC-2 is used between P-Unit and SCU. This driver delas with
+ * IPC-1 Driver provides an API for power control unit registers (e.g. MSIC)
+ * along with other APIs.
+ */
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/sysdev.h>
+#include <linux/pm.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <asm/setup.h>
+#include <asm/intel_scu_ipc.h>
+
+/* IPC defines the following message types */
+#define IPCMSG_WATCHDOG_TIMER 0xF8 /* Set Kernel Watchdog Threshold */
+#define IPCMSG_BATTERY        0xEF /* Coulomb Counter Accumulator */
+#define IPCMSG_FW_UPDATE      0xFE /* Firmware update */
+#define IPCMSG_PCNTRL         0xFF /* Power controller unit read/write */
+#define IPCMSG_FW_REVISION    0xF4 /* Get firmware revision */
+
+/* Command id associated with message IPCMSG_PCNTRL */
+#define IPC_CMD_PCNTRL_W      0 /* Register write */
+#define IPC_CMD_PCNTRL_R      1 /* Register read */
+#define IPC_CMD_PCNTRL_M      2 /* Register read-modify-write */
+
+/* Miscelaneous Command ids */
+#define IPC_CMD_INDIRECT_RD   2 /* 32bit indirect read */
+#define IPC_CMD_INDIRECT_WR   5 /* 32bit indirect write */
+
+/*
+ * IPC register summary
+ *
+ * IPC register blocks are memory mapped at fixed address of 0xFF11C000
+ * To read or write information to the SCU, driver writes to IPC-1 memory
+ * mapped registers (base address 0xFF11C000). The following is the IPC
+ * mechanism
+ *
+ * 1. IA core cDMI interface claims this transaction and converts it to a
+ *    Transaction Layer Packet (TLP) message which is sent across the cDMI.
+ *
+ * 2. South Complex cDMI block receives this message and writes it to
+ *    the IPC-1 register block, causing an interrupt to the SCU
+ *
+ * 3. SCU firmware decodes this interrupt and IPC message and the appropriate
+ *    message handler is called within firmware.
+ */
+
+#define IPC_BASE_ADDR     0xFF11C000	/* IPC1 base register address */
+#define IPC_MAX_ADDR      0x100		/* Maximum IPC regisers */
+#define IPC_WWBUF_SIZE    16		/* IPC Write buffer Size */
+#define IPC_RWBUF_SIZE    16		/* IPC Read buffer Size */
+#define IPC_I2C_BASE      0xFF12B000	/* I2C control register base address */
+#define IPC_I2C_MAX_ADDR  0x10		/* Maximum I2C regisers */
+
+static int ipc_probe(struct pci_dev *dev, const struct pci_device_id *id);
+static void ipc_remove(struct pci_dev *pdev);
+
+struct intel_scu_ipc_dev {
+	struct pci_dev *pdev;
+	void __iomem *ipc_base;
+	void __iomem *i2c_base;
+};
+
+static struct intel_scu_ipc_dev  ipcdev; /* Only one for now */
+
+static int platform = 1;
+module_param(platform, int, 0);
+MODULE_PARM_DESC(platform, "1 for moorestown platform");
+
+
+
+
+/*
+ * IPC Read Buffer (Read Only):
+ * 16 byte buffer for receiving data from SCU, if IPC command
+ * processing results in response data
+ */
+#define IPC_READ_BUFFER		0x90
+
+#define IPC_I2C_CNTRL_ADDR	0
+#define I2C_DATA_ADDR		0x04
+
+static DEFINE_MUTEX(ipclock); /* lock used to prevent multiple call to SCU */
+
+/*
+ * Command Register (Write Only):
+ * A write to this register results in an interrupt to the SCU core processor
+ * Format:
+ * |rfu2(8) | size(8) | command id(4) | rfu1(3) | ioc(1) | command(8)|
+ */
+static inline void ipc_command(u32 cmd) /* Send ipc command */
+{
+	writel(cmd, ipcdev.ipc_base);
+}
+
+/*
+ * IPC Write Buffer (Write Only):
+ * 16-byte buffer for sending data associated with IPC command to
+ * SCU. Size of the data is specified in the IPC_COMMAND_REG register
+ */
+static inline void ipc_data_writel(u32 data, u32 offset) /* Write ipc data */
+{
+	writel(data, ipcdev.ipc_base + 0x80 + offset);
+}
+
+/*
+ * IPC destination Pointer (Write Only):
+ * Use content as pointer for destination write
+ */
+static inline void ipc_write_dptr(u32 data) /* Write dptr data */
+{
+	writel(data, ipcdev.ipc_base + 0x0C);
+}
+
+/*
+ * IPC Source Pointer (Write Only):
+ * Use content as pointer for read location
+*/
+static inline void ipc_write_sptr(u32 data) /* Write dptr data */
+{
+	writel(data, ipcdev.ipc_base + 0x08);
+}
+
+/*
+ * Status Register (Read Only):
+ * Driver will read this register to get the ready/busy status of the IPC
+ * block and error status of the IPC command that was just processed by SCU
+ * Format:
+ * |rfu3(8)|error code(8)|initiator id(8)|cmd id(4)|rfu1(2)|error(1)|busy(1)|
+ */
+
+static inline u8 ipc_read_status(void)
+{
+	return __raw_readl(ipcdev.ipc_base + 0x04);
+}
+
+static inline u8 ipc_data_readb(u32 offset) /* Read ipc byte data */
+{
+	return readb(ipcdev.ipc_base + IPC_READ_BUFFER + offset);
+}
+
+static inline u8 ipc_data_readl(u32 offset) /* Read ipc u32 data */
+{
+	return readl(ipcdev.ipc_base + IPC_READ_BUFFER + offset);
+}
+
+static inline int busy_loop(void) /* Wait till scu status is busy */
+{
+	u32 status = 0;
+	u32 loop_count = 0;
+
+	status = ipc_read_status();
+	while (status & 1) {
+		udelay(1); /* scu processing time is in few u secods */
+		status = ipc_read_status();
+		loop_count++;
+		/* break if scu doesn't reset busy bit after huge retry */
+		if (loop_count > 100000) {
+			dev_err(&ipcdev.pdev->dev, "IPC timed out");
+			return -ETIMEDOUT;
+		}
+	}
+	return (status >> 1) & 1;
+}
+
+/* Read/Write power control(PMIC in Langwell, MSIC in PenWell) registers */
+static int pwr_reg_rdwr(u16 *addr, u8 *data, u32 count, u32 op, u32 id)
+{
+	int nc;
+	u32 offset = 0;
+	u32 err = 0;
+	u8 cbuf[IPC_WWBUF_SIZE] = { '\0' };
+	u32 *wbuf = (u32 *)&cbuf;
+
+	mutex_lock(&ipclock);
+	if (ipcdev.pdev == NULL) {
+		mutex_unlock(&ipclock);
+		return -ENODEV;
+	}
+
+	if (platform == 1) {
+		/* Entry is 4 bytes for read/write, 5 bytes for read modify */
+		for (nc = 0; nc < count; nc++) {
+			cbuf[offset] = addr[nc];
+			cbuf[offset + 1] = addr[nc] >> 8;
+			if (id != IPC_CMD_PCNTRL_R)
+				cbuf[offset + 2] = data[nc];
+			if (id == IPC_CMD_PCNTRL_M) {
+				cbuf[offset + 3] = data[nc + 1];
+				offset += 1;
+			}
+			offset += 3;
+		}
+		for (nc = 0, offset = 0; nc < count; nc++, offset += 4)
+			ipc_data_writel(wbuf[nc], offset); /* Write wbuff */
+
+	} else {
+		for (nc = 0, offset = 0; nc < count; nc++, offset += 2)
+			ipc_data_writel(addr[nc], offset); /* Write addresses */
+		if (id != IPC_CMD_PCNTRL_R) {
+			for (nc = 0; nc < count; nc++, offset++)
+				ipc_data_writel(data[nc], offset); /* Write data */
+			if (id == IPC_CMD_PCNTRL_M)
+				ipc_data_writel(data[nc + 1], offset); /* Mask value*/
+		}
+	}
+
+	if (id != IPC_CMD_PCNTRL_M)
+		ipc_command((count * 3) << 16 |  id << 12 | 0 << 8 | op);
+	else
+		ipc_command((count * 4) << 16 |  id << 12 | 0 << 8 | op);
+
+	err = busy_loop();
+
+	if (id == IPC_CMD_PCNTRL_R) { /* Read rbuf */
+		/* Workaround: values are read as 0 without memcpy_fromio */
+		memcpy_fromio(cbuf, ipcdev.ipc_base + IPC_READ_BUFFER, 16);
+		if (platform == 1) {
+			for (nc = 0, offset = 2; nc < count; nc++, offset += 3)
+				data[nc] = ipc_data_readb(offset);
+		} else {
+			for (nc = 0; nc < count; nc++)
+				data[nc] = ipc_data_readb(nc);
+		}
+	}
+	mutex_unlock(&ipclock);
+	return err;
+}
+
+/**
+ *	intel_scu_ipc_ioread8		-	read a word via the SCU
+ *	@addr: register on SCU
+ *	@data: return pointer for read byte
+ *
+ *	Read a single register. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_ioread8(u16 addr, u8 *data)
+{
+	return pwr_reg_rdwr(&addr, data, 1, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_R);
+}
+EXPORT_SYMBOL(intel_scu_ipc_ioread8);
+
+/**
+ *	intel_scu_ipc_ioread16		-	read a word via the SCU
+ *	@addr: register on SCU
+ *	@data: return pointer for read word
+ *
+ *	Read a register pair. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_ioread16(u16 addr, u16 *data)
+{
+	u16 x[2] = {addr, addr + 1 };
+	return pwr_reg_rdwr(x, (u8 *)data, 2, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_R);
+}
+EXPORT_SYMBOL(intel_scu_ipc_ioread16);
+
+/**
+ *	intel_scu_ipc_ioread32		-	read a dword via the SCU
+ *	@addr: register on SCU
+ *	@data: return pointer for read dword
+ *
+ *	Read four registers. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_ioread32(u16 addr, u32 *data)
+{
+	u16 x[4] = {addr, addr + 1, addr + 2, addr + 3};
+	return pwr_reg_rdwr(x, (u8 *)data, 4, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_R);
+}
+EXPORT_SYMBOL(intel_scu_ipc_ioread32);
+
+/**
+ *	intel_scu_ipc_iowrite8		-	write a byte via the SCU
+ *	@addr: register on SCU
+ *	@data: byte to write
+ *
+ *	Write a single register. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_iowrite8(u16 addr, u8 data)
+{
+	return pwr_reg_rdwr(&addr, &data, 1, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_W);
+}
+EXPORT_SYMBOL(intel_scu_ipc_iowrite8);
+
+/**
+ *	intel_scu_ipc_iowrite16		-	write a word via the SCU
+ *	@addr: register on SCU
+ *	@data: word to write
+ *
+ *	Write two registers. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_iowrite16(u16 addr, u16 data)
+{
+	u16 x[2] = {addr, addr + 1 };
+	return pwr_reg_rdwr(x, (u8 *)&data, 2, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_W);
+}
+EXPORT_SYMBOL(intel_scu_ipc_iowrite16);
+
+/**
+ *	intel_scu_ipc_iowrite32		-	write a dword via the SCU
+ *	@addr: register on SCU
+ *	@data: dword to write
+ *
+ *	Write four registers. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_iowrite32(u16 addr, u32 data)
+{
+	u16 x[4] = {addr, addr + 1, addr + 2, addr + 3};
+	return pwr_reg_rdwr(x, (u8 *)&data, 4, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_W);
+}
+EXPORT_SYMBOL(intel_scu_ipc_iowrite32);
+
+/**
+ *	intel_scu_ipc_readvv		-	read a set of registers
+ *	@addr: register list
+ *	@data: bytes to return
+ *	@len: length of array
+ *
+ *	Read registers. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	The largest array length permitted by the hardware is 5 items.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_readv(u16 *addr, u8 *data, int len)
+{
+	return pwr_reg_rdwr(addr, data, len, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_R);
+}
+EXPORT_SYMBOL(intel_scu_ipc_readv);
+
+/**
+ *	intel_scu_ipc_writev		-	write a set of registers
+ *	@addr: register list
+ *	@data: bytes to write
+ *	@len: length of array
+ *
+ *	Write registers. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	The largest array length permitted by the hardware is 5 items.
+ *
+ *	This function may sleep.
+ *
+ */
+int intel_scu_ipc_writev(u16 *addr, u8 *data, int len)
+{
+	return pwr_reg_rdwr(addr, data, len, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_W);
+}
+EXPORT_SYMBOL(intel_scu_ipc_writev);
+
+
+/**
+ *	intel_scu_ipc_update_register	-	r/m/w a register
+ *	@addr: register address
+ *	@bits: bits to update
+ *	@mask: mask of bits to update
+ *
+ *	Read-modify-write power control unit register. The first data argument
+ *	must be register value and second is mask value
+ *	mask is a bitmap that indicates which bits to update.
+ *	0 = masked. Don't modify this bit, 1 = modify this bit.
+ *	returns 0 on success or an error code.
+ *
+ *	This function may sleep. Locking between SCU accesses is handled
+ *	for the caller.
+ */
+int intel_scu_ipc_update_register(u16 addr, u8 bits, u8 mask)
+{
+	u8 data[2] = { bits, mask };
+	return pwr_reg_rdwr(&addr, data, 1, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_M);
+}
+EXPORT_SYMBOL(intel_scu_ipc_update_register);
+
+/**
+ *	intel_scu_ipc_register_read	-	32bit indirect read
+ *	@addr: register address
+ *	@value: 32bit value return
+ *
+ *	Performs IA 32 bit indirect read, returns 0 on success, or an
+ *	error code.
+ *
+ *	Can be used when SCCB(System Controller Configuration Block) register
+ *	HRIM(Honor Restricted IPC Messages) is set (bit 23)
+ *
+ *	This function may sleep. Locking for SCU accesses is handled for
+ *	the caller.
+ */
+int intel_scu_ipc_register_read(u32 addr, u32 *value)
+{
+	u32 err = 0;
+
+	mutex_lock(&ipclock);
+	if (ipcdev.pdev == NULL) {
+		mutex_unlock(&ipclock);
+		return -ENODEV;
+	}
+	ipc_write_sptr(addr);
+	ipc_command(4 << 16 | IPC_CMD_INDIRECT_RD);
+	err = busy_loop();
+	*value = ipc_data_readl(0);
+	mutex_unlock(&ipclock);
+	return err;
+}
+EXPORT_SYMBOL(intel_scu_ipc_register_read);
+
+/**
+ *	intel_scu_ipc_register_write	-	32bit indirect write
+ *	@addr: register address
+ *	@value: 32bit value to write
+ *
+ *	Performs IA 32 bit indirect write, returns 0 on success, or an
+ *	error code.
+ *
+ *	Can be used when SCCB(System Controller Configuration Block) register
+ *	HRIM(Honor Restricted IPC Messages) is set (bit 23)
+ *
+ *	This function may sleep. Locking for SCU accesses is handled for
+ *	the caller.
+ */
+int intel_scu_ipc_register_write(u32 addr, u32 value)
+{
+	u32 err = 0;
+
+	mutex_lock(&ipclock);
+	if (ipcdev.pdev == NULL) {
+		mutex_unlock(&ipclock);
+		return -ENODEV;
+	}
+	ipc_write_dptr(addr);
+	ipc_data_writel(value, 0);
+	ipc_command(4 << 16 | IPC_CMD_INDIRECT_WR);
+	err = busy_loop();
+	mutex_unlock(&ipclock);
+	return err;
+}
+EXPORT_SYMBOL(intel_scu_ipc_register_write);
+
+/**
+ *	intel_scu_ipc_simple_command	-	send a simple command
+ *	@cmd: command
+ *	@sub: sub type
+ *
+ *	Issue a simple command to the SCU. Do not use this interface if
+ *	you must then access data as any data values may be overwritten
+ *	by another SCU access by the time this function returns.
+ *
+ *	This function may sleep. Locking for SCU accesses is handled for
+ *	the caller.
+ */
+int intel_scu_ipc_simple_command(int cmd, int sub)
+{
+	u32 err = 0;
+
+	mutex_lock(&ipclock);
+	if (ipcdev.pdev == NULL) {
+		mutex_unlock(&ipclock);
+		return -ENODEV;
+	}
+	ipc_command(cmd << 12 | sub);
+	err = busy_loop();
+	mutex_unlock(&ipclock);
+	return err;
+}
+EXPORT_SYMBOL(intel_scu_ipc_simple_command);
+
+/**
+ *	intel_scu_ipc_command	-	command with data
+ *	@cmd: command
+ *	@sub: sub type
+ *	@in: input data
+ *	@inlen: input length
+ *	@out: output data
+ *	@outlein: output length
+ *
+ *	Issue a command to the SCU which involves data transfers. Do the
+ *	data copies under the lock but leave it for the caller to interpret
+ */
+
+int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
+							u32 *out, int outlen)
+{
+	u32 err = 0;
+	int i = 0;
+
+	mutex_lock(&ipclock);
+	if (ipcdev.pdev == NULL) {
+		mutex_unlock(&ipclock);
+		return -ENODEV;
+	}
+
+	for (i = 0; i < inlen; i++)
+		ipc_data_writel(*in++, 4 * i);
+
+	ipc_command(cmd << 12 | sub);
+	err = busy_loop();
+
+	for (i = 0; i < outlen; i++)
+		*out++ = ipc_data_readl(4 * i);
+
+	mutex_unlock(&ipclock);
+	return err;
+}
+EXPORT_SYMBOL(intel_scu_ipc_command);
+
+/*I2C commands */
+#define IPC_I2C_WRITE 1 /* I2C Write command */
+#define IPC_I2C_READ  2 /* I2C Read command */
+
+/**
+ *	intel_scu_ipc_i2c_cntrl		-	I2C read/write operations
+ *	@addr: I2C address + command bits
+ *	@data: data to read/write
+ *
+ *	Perform an an I2C read/write operation via the SCU. All locking is
+ *	handled for the caller. This function may sleep.
+ *
+ *	Returns an error code or 0 on success.
+ *
+ *	This has to be in the IPC driver for the locking.
+ */
+int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data)
+{
+	u32 cmd = 0;
+
+	mutex_lock(&ipclock);
+	cmd = (addr >> 24) & 0xFF;
+	if (cmd == IPC_I2C_READ) {
+		writel(addr, ipcdev.i2c_base + IPC_I2C_CNTRL_ADDR);
+		/* Write not getting updated without delay */
+		mdelay(1);
+		*data = readl(ipcdev.i2c_base + I2C_DATA_ADDR);
+	} else if (cmd == IPC_I2C_WRITE) {
+		writel(addr, ipcdev.i2c_base + I2C_DATA_ADDR);
+		mdelay(1);
+		writel(addr, ipcdev.i2c_base + IPC_I2C_CNTRL_ADDR);
+	} else {
+		dev_err(&ipcdev.pdev->dev,
+			"intel_scu_ipc: I2C INVALID_CMD = 0x%x\n", cmd);
+
+		mutex_unlock(&ipclock);
+		return -1;
+	}
+	mutex_unlock(&ipclock);
+	return 0;
+}
+EXPORT_SYMBOL(intel_scu_ipc_i2c_cntrl);
+
+#define IPC_FW_LOAD_ADDR 0xFFFC0000 /* Storage location for FW image */
+#define IPC_FW_UPDATE_MBOX_ADDR 0xFFFFDFF4 /* Mailbox between ipc and scu */
+#define IPC_MAX_FW_SIZE 262144 /* 256K storage size for loading the FW image */
+#define IPC_FW_MIP_HEADER_SIZE 2048 /* Firmware MIP header size */
+/* IPC inform SCU to get ready for update process */
+#define IPC_CMD_FW_UPDATE_READY  0x10FE
+/* IPC inform SCU to go for update process */
+#define IPC_CMD_FW_UPDATE_GO     0x20FE
+/* Status code for fw update */
+#define IPC_FW_UPDATE_SUCCESS	0x444f4e45 /* Status code 'DONE' */
+#define IPC_FW_UPDATE_BADN	0x4241444E /* Status code 'BADN' */
+#define IPC_FW_TXHIGH		0x54784849 /* Status code 'IPC_FW_TXHIGH' */
+#define IPC_FW_TXLOW		0x54784c4f /* Status code 'IPC_FW_TXLOW' */
+
+struct fw_update_mailbox {
+	u32    status;
+	u32    scu_flag;
+	u32    driver_flag;
+};
+
+
+/**
+ *	intel_scu_ipc_fw_update	-	 Firmware update utility
+ *	@buffer: firmware buffer
+ *	@length: size of firmware buffer
+ *
+ *	This function provides an interface to load the firmware into
+ *	the SCU. Returns 0 on success or -1 on failure
+ */
+int intel_scu_ipc_fw_update(u8 *buffer, u32 length)
+{
+	void __iomem *fw_update_base;
+	struct fw_update_mailbox __iomem *mailbox = NULL;
+	int retry_cnt = 0;
+	u32 status;
+
+	mutex_lock(&ipclock);
+	fw_update_base = ioremap_nocache(IPC_FW_LOAD_ADDR, (128*1024));
+	if (fw_update_base == NULL) {
+		mutex_unlock(&ipclock);
+		return -ENOMEM;
+	}
+	mailbox = ioremap_nocache(IPC_FW_UPDATE_MBOX_ADDR,
+					sizeof(struct fw_update_mailbox));
+	if (mailbox == NULL) {
+		iounmap(fw_update_base);
+		mutex_unlock(&ipclock);
+		return -ENOMEM;
+	}
+
+	ipc_command(IPC_CMD_FW_UPDATE_READY);
+
+	/* Intitialize mailbox */
+	writel(0, &mailbox->status);
+	writel(0, &mailbox->scu_flag);
+	writel(0, &mailbox->driver_flag);
+
+	/* Driver copies the 2KB MIP header to SRAM at 0xFFFC0000*/
+	memcpy_toio(fw_update_base, buffer, 0x800);
+
+	/* Driver sends "FW Update" IPC command (CMD_ID 0xFE; MSG_ID 0x02).
+	* Upon receiving this command, SCU will write the 2K MIP header
+	* from 0xFFFC0000 into NAND.
+	* SCU will write a status code into the Mailbox, and then set scu_flag.
+	*/
+
+	ipc_command(IPC_CMD_FW_UPDATE_GO);
+
+	/*Driver stalls until scu_flag is set */
+	while (readl(&mailbox->scu_flag) != 1) {
+		rmb();
+		mdelay(1);
+	}
+
+	/* Driver checks Mailbox status.
+	 * If the status is 'BADN', then abort (bad NAND).
+	 * If the status is 'IPC_FW_TXLOW', then continue.
+	 */
+	while (readl(&mailbox->status) != IPC_FW_TXLOW) {
+		rmb();
+		mdelay(10);
+	}
+	mdelay(10);
+
+update_retry:
+	if (retry_cnt > 5)
+		goto update_end;
+
+	if (readl(&mailbox->status) != IPC_FW_TXLOW)
+		goto update_end;
+	buffer = buffer + 0x800;
+	memcpy_toio(fw_update_base, buffer, 0x20000);
+	writel(1, &mailbox->driver_flag);
+	while (readl(&mailbox->scu_flag) == 1) {
+		rmb();
+		mdelay(1);
+	}
+
+	/* check for 'BADN' */
+	if (readl(&mailbox->status) == IPC_FW_UPDATE_BADN)
+		goto update_end;
+
+	while (readl(&mailbox->status) != IPC_FW_TXHIGH) {
+		rmb();
+		mdelay(10);
+	}
+	mdelay(10);
+
+	if (readl(&mailbox->status) != IPC_FW_TXHIGH)
+		goto update_end;
+
+	buffer = buffer + 0x20000;
+	memcpy_toio(fw_update_base, buffer, 0x20000);
+	writel(0, &mailbox->driver_flag);
+
+	while (mailbox->scu_flag == 0) {
+		rmb();
+		mdelay(1);
+	}
+
+	/* check for 'BADN' */
+	if (readl(&mailbox->status) == IPC_FW_UPDATE_BADN)
+		goto update_end;
+
+	if (readl(&mailbox->status) == IPC_FW_TXLOW) {
+		++retry_cnt;
+		goto update_retry;
+	}
+
+update_end:
+	status = readl(&mailbox->status);
+
+	iounmap(fw_update_base);
+	iounmap(mailbox);
+	mutex_unlock(&ipclock);
+
+	if (status == IPC_FW_UPDATE_SUCCESS)
+		return 0;
+	return -1;
+}
+EXPORT_SYMBOL(intel_scu_ipc_fw_update);
+
+/*
+ * Interrupt handler gets called when ioc bit of IPC_COMMAND_REG set to 1
+ * When ioc bit is set to 1, caller api must wait for interrupt handler called
+ * which in turn unlocks the caller api. Currently this is not used
+ *
+ * This is edge triggered so we need take no action to clear anything
+ */
+static irqreturn_t ioc(int irq, void *dev_id)
+{
+	return IRQ_HANDLED;
+}
+
+/**
+ *	ipc_probe	-	probe an Intel SCU IPC
+ *	@dev: the PCI device matching
+ *	@id: entry in the match table
+ *
+ *	Enable and install an intel SCU IPC. This appears in the PCI space
+ *	but uses some hard coded addresses as well.
+ */
+static int ipc_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	int err;
+	resource_size_t pci_resource;
+
+	if (ipcdev.pdev)		/* We support only one SCU */
+		return -EBUSY;
+
+	ipcdev.pdev = pci_dev_get(dev);
+
+	err = pci_enable_device(dev);
+	if (err)
+		return err;
+
+	err = pci_request_regions(dev, "intel_scu_ipc");
+	if (err)
+		return err;
+
+	pci_resource = pci_resource_start(dev, 0);
+	if (!pci_resource)
+		return -ENOMEM;
+
+	if (request_irq(dev->irq, ioc, 0, "intel_scu_ipc", &ipcdev))
+		return -EBUSY;
+
+	ipcdev.ipc_base = ioremap_nocache(IPC_BASE_ADDR, IPC_MAX_ADDR);
+	if (!ipcdev.ipc_base)
+		return -ENOMEM;
+
+	ipcdev.i2c_base = ioremap_nocache(IPC_I2C_BASE, IPC_I2C_MAX_ADDR);
+	if (!ipcdev.i2c_base) {
+		iounmap(ipcdev.ipc_base);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/**
+ *	ipc_remove	-	remove a bound IPC device
+ *	@pdev: PCI device
+ *
+ *	In practice the SCU is not removable but this function is also
+ *	called for each device on a module unload or cleanup which is the
+ *	path that will get used.
+ *
+ *	Free up the mappings and release the PCI resources
+ */
+static void ipc_remove(struct pci_dev *pdev)
+{
+	free_irq(pdev->irq, &ipcdev);
+	pci_release_regions(pdev);
+	pci_dev_put(ipcdev.pdev);
+	iounmap(ipcdev.ipc_base);
+	iounmap(ipcdev.i2c_base);
+	ipcdev.pdev = NULL;
+}
+
+static const struct pci_device_id pci_ids[] = {
+	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x080e)},
+	{ 0,}
+};
+MODULE_DEVICE_TABLE(pci, pci_ids);
+
+static struct pci_driver ipc_driver = {
+	.name = "intel_scu_ipc",
+	.id_table = pci_ids,
+	.probe = ipc_probe,
+	.remove = ipc_remove,
+};
+
+
+static int __init intel_scu_ipc_init(void)
+{
+	return  pci_register_driver(&ipc_driver);
+}
+
+static void __exit intel_scu_ipc_exit(void)
+{
+	pci_unregister_driver(&ipc_driver);
+}
+
+MODULE_AUTHOR("Sreedhara DS <sreedhara.ds@intel.com>");
+MODULE_DESCRIPTION("Intel SCU IPC driver");
+MODULE_LICENSE("GPL");
+
+module_init(intel_scu_ipc_init);
+module_exit(intel_scu_ipc_exit);
-- 
cgit v1.2.3-55-g7522


From fec84e330719c20d2146c8dbdc9ff50b3a1d7039 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann
Date: Mon, 17 May 2010 18:43:24 +0200
Subject: x86, hpet: Add reference to chipset erratum documentation for
 disable-hpet-msi-quirk

(At the moment the "SB700 Family Product Errata" document is available
at http://support.amd.com/us/Embedded_TechDocs/46837.pdf)

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
LKML-Reference: <20100517164324.GB10254@alberich.amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/quirks.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index cd2c336c8d09..e72d3fc6547d 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -495,6 +495,9 @@ void force_hpet_resume(void)
 /*
  * HPET MSI on some boards (ATI SB700/SB800) has side effect on
  * floppy DMA. Disable HPET MSI on such platforms.
+ * See erratum #27 (Misinterpreted MSI Requests May Result in
+ * Corrupted LPC DMA Data) in AMD Publication #46837,
+ * "SB700 Family Product Errata", Rev. 1.0, March 2010.
  *
  * Also force the read back of the CMP register in hpet_next_event()
  * to work around the problem that the CMP register write seems to be
-- 
cgit v1.2.3-55-g7522


From c59bd5688299cddb71183e156e7a3c1409b90df2 Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Mon, 17 May 2010 15:13:23 -0700
Subject: x86, hweight: Use a 32-bit popcnt for __arch_hweight32()

Use a 32-bit popcnt instruction for __arch_hweight32(), even on
x86-64.  Even though the input register will *usually* be
zero-extended due to the standard operation of the hardware, it isn't
necessarily so if the input value was the result of truncating a
64-bit operation.

Note: the POPCNT32 variant used on x86-64 has a technically
unnecessary REX prefix to make it five bytes long, the same as a CALL
instruction, therefore avoiding an unnecessary NOP.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <alpine.LFD.2.00.1005171443060.4195@i5.linux-foundation.org>
---
 arch/x86/include/asm/arch_hweight.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index d1fc3c219ae6..9686c3d9ff73 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -2,13 +2,15 @@
 #define _ASM_X86_HWEIGHT_H
 
 #ifdef CONFIG_64BIT
+/* popcnt %edi, %eax -- redundant REX prefix for alignment */
+#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
 /* popcnt %rdi, %rax */
-#define POPCNT ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
+#define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
 #define REG_IN "D"
 #define REG_OUT "a"
 #else
 /* popcnt %eax, %eax */
-#define POPCNT ".byte 0xf3,0x0f,0xb8,0xc0"
+#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc0"
 #define REG_IN "a"
 #define REG_OUT "a"
 #endif
@@ -23,7 +25,7 @@ static inline unsigned int __arch_hweight32(unsigned int w)
 {
 	unsigned int res = 0;
 
-	asm (ALTERNATIVE("call __sw_hweight32", POPCNT, X86_FEATURE_POPCNT)
+	asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
 		     : "="REG_OUT (res)
 		     : REG_IN (w));
 
@@ -48,7 +50,7 @@ static inline unsigned long __arch_hweight64(__u64 w)
 	return  __arch_hweight32((u32)w) +
 		__arch_hweight32((u32)(w >> 32));
 #else
-	asm (ALTERNATIVE("call __sw_hweight64", POPCNT, X86_FEATURE_POPCNT)
+	asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
 		     : "="REG_OUT (res)
 		     : REG_IN (w));
 #endif /* CONFIG_X86_32 */
-- 
cgit v1.2.3-55-g7522


From 0db1a7bc00216a981d0b7056627ad8682f4f0636 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Mon, 17 May 2010 16:13:04 +0800
Subject: perf, x86: P4 PMU -- handle unflagged events

It might happen that an event can overflow without
the proper overflow flag set. Check the sign bit in
the raw counter value to solve this problem.

Tested-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: fweisbec@gmail.com
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1274083984.6540.15.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 424fc8de68e4..02f072830237 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -465,15 +465,21 @@ out:
 	return rc;
 }
 
-static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
 {
-	unsigned long dummy;
+	int overflow = 0;
+	u32 low, high;
 
-	rdmsrl(hwc->config_base + hwc->idx, dummy);
-	if (dummy & P4_CCCR_OVF) {
+	rdmsr(hwc->config_base + hwc->idx, low, high);
+
+	/* we need to check high bit for unflagged overflows */
+	if ((low & P4_CCCR_OVF) || (high & (1 << 31))) {
+		overflow = 1;
 		(void)checking_wrmsrl(hwc->config_base + hwc->idx,
-			((u64)dummy) & ~P4_CCCR_OVF);
+			((u64)low) & ~P4_CCCR_OVF);
 	}
+
+	return overflow;
 }
 
 static inline void p4_pmu_disable_event(struct perf_event *event)
@@ -584,21 +590,15 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 
 		WARN_ON_ONCE(hwc->idx != idx);
 
-		/*
-		 * FIXME: Redundant call, actually not needed
-		 * but just to check if we're screwed
-		 */
-		p4_pmu_clear_cccr_ovf(hwc);
+		/* it might be unflagged overflow */
+		handled = p4_pmu_clear_cccr_ovf(hwc);
 
 		val = x86_perf_event_update(event);
-		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
+		if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
 			continue;
 
-		/*
-		 * event overflow
-		 */
-		handled		= 1;
-		data.period	= event->hw.last_period;
+		/* event overflow for sure */
+		data.period = event->hw.last_period;
 
 		if (!x86_perf_event_set_period(event))
 			continue;
-- 
cgit v1.2.3-55-g7522


From ef4f30f54e265c2f6f9ac9eda4db158a4e16050b Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Tue, 18 May 2010 17:29:14 +0800
Subject: perf, x86: P4 PMU -- fix typo in unflagged NMI handling

Tested-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
LKML-Reference: <1274174954.22793.17.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 02f072830237..87e1803e67a6 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -473,7 +473,7 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
 	rdmsr(hwc->config_base + hwc->idx, low, high);
 
 	/* we need to check high bit for unflagged overflows */
-	if ((low & P4_CCCR_OVF) || (high & (1 << 31))) {
+	if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
 		overflow = 1;
 		(void)checking_wrmsrl(hwc->config_base + hwc->idx,
 			((u64)low) & ~P4_CCCR_OVF);
-- 
cgit v1.2.3-55-g7522


From a02ce953a14d6a8aab721b129b3c8ff4981a76e6 Mon Sep 17 00:00:00 2001
From: Feng Tang
Date: Wed, 5 May 2010 17:08:49 +0800
Subject: x86/PCI: make ACPI MCFG reserved error messages ACPI specific

Both ACPI and SFI firmwares will have MCFG space, but the error message
isn't valid on SFI, so don't print the message in that case.

Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/mmconfig-shared.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 39b9ebe8f886..a918553ebc75 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -483,16 +483,17 @@ static void __init pci_mmcfg_reject_broken(int early)
 	list_for_each_entry(cfg, &pci_mmcfg_list, list) {
 		int valid = 0;
 
-		if (!early && !acpi_disabled)
+		if (!early && !acpi_disabled) {
 			valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0);
 
-		if (valid)
-			continue;
-
-		if (!early)
-			printk(KERN_ERR FW_BUG PREFIX
-			       "MMCONFIG at %pR not reserved in "
-			       "ACPI motherboard resources\n", &cfg->res);
+			if (valid)
+				continue;
+			else
+				printk(KERN_ERR FW_BUG PREFIX
+				       "MMCONFIG at %pR not reserved in "
+				       "ACPI motherboard resources\n",
+				       &cfg->res);
+		}
 
 		/* Don't try to do this check unless configuration
 		   type 1 is available. how about type 2 ?*/
-- 
cgit v1.2.3-55-g7522


From 623aab896ee1a532cb540bcf0d5ae8a88275afd5 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Wed, 19 May 2010 01:19:17 +0400
Subject: perf, x86: P4 PMU -- do a real check for ESCR address being in hash

To prevent from clashes in future code modifications
do a real check for ESCR address being in hash. At
moment the callers are known to pass sane values but
better to be on a safe side.

And comment fix.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Lin Ming <ming.m.lin@intel.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100518212439.004503600@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 87e1803e67a6..5f8e36d62793 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -670,7 +670,7 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 
 /*
  * ESCR address hashing is tricky, ESCRs are not sequential
- * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03e0) and
+ * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
  * the metric between any ESCRs is laid in range [0xa0,0xe1]
  *
  * so we make ~70% filled hashtable
@@ -735,8 +735,9 @@ static int p4_get_escr_idx(unsigned int addr)
 {
 	unsigned int idx = P4_ESCR_MSR_IDX(addr);
 
-	if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
-			!p4_escr_table[idx])) {
+	if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE	||
+			!p4_escr_table[idx]		||
+			p4_escr_table[idx] != addr)) {
 		WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
 		return -1;
 	}
-- 
cgit v1.2.3-55-g7522


From 9d36dfcf219e2ba1f1d169a7f92dcf2cbd4e05f0 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Wed, 19 May 2010 01:19:18 +0400
Subject: perf, x86: P4_pmu_schedule_events -- use smp_processor_id instead of
 raw_

This snippet somehow escaped the commit:

 | commit 137351e0feeb9f25d99488ee1afc1c79f5499a9a
 | Author: Cyrill Gorcunov <gorcunov@openvz.org>
 | Date:   Sat May 8 15:25:52 2010 +0400
 |
 |    x86, perf: P4 PMU -- protect sensible procedures from preemption

so bring it eventually back. It helps to catch
preemption issue (if there will be, rule of thumb --
don't use raw_ if you can).

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100518212439.167259349@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 5f8e36d62793..ae85d69644d1 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -763,7 +763,7 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 {
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
-	int cpu = raw_smp_processor_id();
+	int cpu = smp_processor_id();
 	struct hw_perf_event *hwc;
 	struct p4_event_bind *bind;
 	unsigned int i, thread, num;
-- 
cgit v1.2.3-55-g7522


From ce7f15452cc1dc1eca795542367871a07f37aa79 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov
Date: Wed, 19 May 2010 01:19:19 +0400
Subject: perf, x86: P4 PMU -- add missing bit in CCCR mask

Should be there for the sake of RAW events.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Lin Ming <ming.m.lin@intel.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100518212439.354345151@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index b05400a542ff..64a8ebff06fc 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -89,7 +89,8 @@
 	P4_CCCR_ENABLE)
 
 /* HT mask */
-#define P4_CCCR_MASK_HT	(P4_CCCR_MASK | P4_CCCR_THREAD_ANY)
+#define P4_CCCR_MASK_HT				\
+	(P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY)
 
 #define P4_GEN_ESCR_EMASK(class, name, bit)	\
 	class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
-- 
cgit v1.2.3-55-g7522


From 5a7388c2d2faa2cc70c2d4717c8d7836d55459e0 Mon Sep 17 00:00:00 2001
From: Eric Northup
Date: Mon, 26 Apr 2010 17:00:05 -0700
Subject: KVM: MMU: fix hashing for TDP and non-paging modes

For TDP mode, avoid creating multiple page table roots for the single
guest-to-host physical address map by fixing the inputs used for the
shadow page table hash in mmu_alloc_roots().

Signed-off-by: Eric Northup <digitaleric@google.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ddfa8658fb6d..9696d654b01f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2059,10 +2059,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
 		ASSERT(!VALID_PAGE(root));
-		if (tdp_enabled)
-			direct = 1;
 		if (mmu_check_root(vcpu, root_gfn))
 			return 1;
+		if (tdp_enabled) {
+			direct = 1;
+			root_gfn = 0;
+		}
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 				      PT64_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
@@ -2072,8 +2074,6 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 	direct = !is_paging(vcpu);
-	if (tdp_enabled)
-		direct = 1;
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -2089,6 +2089,10 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = 0;
 		if (mmu_check_root(vcpu, root_gfn))
 			return 1;
+		if (tdp_enabled) {
+			direct = 1;
+			root_gfn = i << 30;
+		}
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
-- 
cgit v1.2.3-55-g7522


From d35b8dd9355805f17225fdbfee4bc704d7bf7547 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng
Date: Tue, 27 Apr 2010 10:39:49 +0800
Subject: KVM: Fix mmu shrinker error

kvm_mmu_remove_one_alloc_mmu_page() assumes kvm_mmu_zap_page() only reclaims
only one sp, but that's not the case. This will cause mmu shrinker returns
a wrong number. This patch fix the counting error.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9696d654b01f..18d2f584945b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2902,13 +2902,13 @@ restart:
 	kvm_flush_remote_tlbs(kvm);
 }
 
-static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
+static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
 {
 	struct kvm_mmu_page *page;
 
 	page = container_of(kvm->arch.active_mmu_pages.prev,
 			    struct kvm_mmu_page, link);
-	kvm_mmu_zap_page(kvm, page);
+	return kvm_mmu_zap_page(kvm, page) + 1;
 }
 
 static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
@@ -2920,7 +2920,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
 	spin_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		int npages, idx;
+		int npages, idx, freed_pages;
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
@@ -2928,8 +2928,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
 			 kvm->arch.n_free_mmu_pages;
 		cache_count += npages;
 		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
-			kvm_mmu_remove_one_alloc_mmu_page(kvm);
-			cache_count--;
+			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
+			cache_count -= freed_pages;
 			kvm_freed = kvm;
 		}
 		nr_to_scan--;
-- 
cgit v1.2.3-55-g7522


From 22c9b2d166a88573a933e8d355833d36579d83be Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Wed, 28 Apr 2010 11:54:44 +0800
Subject: KVM: MMU: fix for calculating gpa in invlpg code

If the guest is 32-bit, we should use 'quadrant' to adjust gpa
offset

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d0cc07eb6eda..3464fdbdb98f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -478,8 +478,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 		    ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
 		    ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
 			struct kvm_mmu_page *sp = page_header(__pa(sptep));
+			int offset, shift;
 
-			pte_gpa = (sp->gfn << PAGE_SHIFT);
+			shift = PAGE_SHIFT -
+				  (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
+			offset = sp->role.quadrant << shift;
+
+			pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
 			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
 			if (is_shadow_present_pte(*sptep)) {
-- 
cgit v1.2.3-55-g7522


From 85f2067c3192212bce3bd2e1d6ad10153d6f2a4e Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Wed, 28 Apr 2010 11:54:55 +0800
Subject: KVM: MMU: convert mmu tracepoints

Convert mmu tracepoints by using DECLARE_EVENT_CLASS

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmutrace.h | 69 +++++++++++++++++++------------------------------
 1 file changed, 26 insertions(+), 43 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 40a1786f36c7..42f07b1bfbc9 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -92,15 +92,15 @@ TRACE_EVENT(
 	TP_printk("pte %llx level %u", __entry->pte, __entry->level)
 );
 
-/* We set a pte accessed bit */
-TRACE_EVENT(
-	kvm_mmu_set_accessed_bit,
+DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
+
 	TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
 	TP_ARGS(table_gfn, index, size),
 
 	TP_STRUCT__entry(
 		__field(__u64, gpa)
-		),
+	),
 
 	TP_fast_assign(
 		__entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
@@ -110,22 +110,20 @@ TRACE_EVENT(
 	TP_printk("gpa %llx", __entry->gpa)
 );
 
-/* We set a pte dirty bit */
-TRACE_EVENT(
-	kvm_mmu_set_dirty_bit,
+/* We set a pte accessed bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
+
 	TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
-	TP_ARGS(table_gfn, index, size),
 
-	TP_STRUCT__entry(
-		__field(__u64, gpa)
-		),
+	TP_ARGS(table_gfn, index, size)
+);
 
-	TP_fast_assign(
-		__entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
-				+ index * size;
-		),
+/* We set a pte dirty bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
 
-	TP_printk("gpa %llx", __entry->gpa)
+	TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+	TP_ARGS(table_gfn, index, size)
 );
 
 TRACE_EVENT(
@@ -164,54 +162,39 @@ TRACE_EVENT(
 		  __entry->created ? "new" : "existing")
 );
 
-TRACE_EVENT(
-	kvm_mmu_sync_page,
+DECLARE_EVENT_CLASS(kvm_mmu_page_class,
+
 	TP_PROTO(struct kvm_mmu_page *sp),
 	TP_ARGS(sp),
 
 	TP_STRUCT__entry(
 		KVM_MMU_PAGE_FIELDS
-		),
+	),
 
 	TP_fast_assign(
 		KVM_MMU_PAGE_ASSIGN(sp)
-		),
+	),
 
 	TP_printk("%s", KVM_MMU_PAGE_PRINTK())
 );
 
-TRACE_EVENT(
-	kvm_mmu_unsync_page,
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
 	TP_PROTO(struct kvm_mmu_page *sp),
-	TP_ARGS(sp),
-
-	TP_STRUCT__entry(
-		KVM_MMU_PAGE_FIELDS
-		),
-
-	TP_fast_assign(
-		KVM_MMU_PAGE_ASSIGN(sp)
-		),
 
-	TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+	TP_ARGS(sp)
 );
 
-TRACE_EVENT(
-	kvm_mmu_zap_page,
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
 	TP_PROTO(struct kvm_mmu_page *sp),
-	TP_ARGS(sp),
 
-	TP_STRUCT__entry(
-		KVM_MMU_PAGE_FIELDS
-		),
+	TP_ARGS(sp)
+);
 
-	TP_fast_assign(
-		KVM_MMU_PAGE_ASSIGN(sp)
-		),
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
+	TP_PROTO(struct kvm_mmu_page *sp),
 
-	TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+	TP_ARGS(sp)
 );
-
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
-- 
cgit v1.2.3-55-g7522


From 5e1b3ddbf220d2256a6a0d676a219ed76b8048d0 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Wed, 28 Apr 2010 11:55:06 +0800
Subject: KVM: MMU: move unsync/sync tracpoints to proper place

Move unsync/sync tracepoints to the proper place, it's good
for us to obtain unsync page live time

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 18d2f584945b..51eb6d6abd86 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1189,6 +1189,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	WARN_ON(!sp->unsync);
+	trace_kvm_mmu_sync_page(sp);
 	sp->unsync = 0;
 	--kvm->stat.mmu_unsync;
 }
@@ -1202,7 +1203,6 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		return 1;
 	}
 
-	trace_kvm_mmu_sync_page(sp);
 	if (rmap_write_protect(vcpu->kvm, sp->gfn))
 		kvm_flush_remote_tlbs(vcpu->kvm);
 	kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1730,7 +1730,6 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	struct kvm_mmu_page *s;
 	struct hlist_node *node, *n;
 
-	trace_kvm_mmu_unsync_page(sp);
 	index = kvm_page_table_hashfn(sp->gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 	/* don't unsync if pagetable is shadowed with multiple roles */
@@ -1740,6 +1739,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		if (s->role.word != sp->role.word)
 			return 1;
 	}
+	trace_kvm_mmu_unsync_page(sp);
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
 
-- 
cgit v1.2.3-55-g7522


From 884a0ff0b68b3ece5987507de168215e14ef7849 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Wed, 28 Apr 2010 11:55:15 +0800
Subject: KVM: MMU: cleanup invlpg code

Using is_last_spte() to cleanup invlpg code

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 3464fdbdb98f..89d66ca4d87c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -474,9 +474,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 		level = iterator.level;
 		sptep = iterator.sptep;
 
-		if (level == PT_PAGE_TABLE_LEVEL  ||
-		    ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
-		    ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
+		if (is_last_spte(*sptep, level)) {
 			struct kvm_mmu_page *sp = page_header(__pa(sptep));
 			int offset, shift;
 
-- 
cgit v1.2.3-55-g7522


From 0ee75bead83da4791e5cbf659806c54d8ee40f12 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 28 Apr 2010 15:39:01 +0300
Subject: KVM: Let vcpu structure alignment be determined at runtime

vmx and svm vcpus have different contents and therefore may have different
alignmment requirements.  Let each specify its required alignment.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/vmm.c       | 2 +-
 arch/powerpc/kvm/44x.c    | 2 +-
 arch/powerpc/kvm/book3s.c | 3 ++-
 arch/powerpc/kvm/e500.c   | 2 +-
 arch/s390/kvm/kvm-s390.c  | 2 +-
 arch/x86/kvm/svm.c        | 2 +-
 arch/x86/kvm/vmx.c        | 3 ++-
 include/linux/kvm_host.h  | 2 +-
 virt/kvm/kvm_main.c       | 7 ++++---
 9 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/kvm/vmm.c b/arch/ia64/kvm/vmm.c
index 7a62f75778c5..f0b9cac82414 100644
--- a/arch/ia64/kvm/vmm.c
+++ b/arch/ia64/kvm/vmm.c
@@ -51,7 +51,7 @@ static int __init  kvm_vmm_init(void)
 	vmm_fpswa_interface = fpswa_interface;
 
 	/*Register vmm data to kvm side*/
-	return kvm_init(&vmm_info, 1024, THIS_MODULE);
+	return kvm_init(&vmm_info, 1024, 0, THIS_MODULE);
 }
 
 static void __exit kvm_vmm_exit(void)
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 689a57c2ac80..73c0a3f64ed1 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -147,7 +147,7 @@ static int __init kvmppc_44x_init(void)
 	if (r)
 		return r;
 
-	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE);
+	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE);
 }
 
 static void __exit kvmppc_44x_exit(void)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 28e785fb2cab..11f226ff4468 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1385,7 +1385,8 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 static int kvmppc_book3s_init(void)
 {
-	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), THIS_MODULE);
+	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
+			THIS_MODULE);
 }
 
 static void kvmppc_book3s_exit(void)
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 669a5c5fc7d7..bc2b4004eb26 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -161,7 +161,7 @@ static int __init kvmppc_e500_init(void)
 	flush_icache_range(kvmppc_booke_handlers,
 			kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
 
-	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE);
+	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
 }
 
 static void __init kvmppc_e500_exit(void)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ee7c713686ce..8093e6f47f49 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -752,7 +752,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 static int __init kvm_s390_init(void)
 {
 	int ret;
-	ret = kvm_init(NULL, sizeof(struct kvm_vcpu), THIS_MODULE);
+	ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
 	if (ret)
 		return ret;
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 889f66022e57..2511664ff671 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3319,7 +3319,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 static int __init svm_init(void)
 {
 	return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
-			      THIS_MODULE);
+			__alignof__(struct vcpu_svm), THIS_MODULE);
 }
 
 static void __exit svm_exit(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 875b785228f6..2e8729678600 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4245,7 +4245,8 @@ static int __init vmx_init(void)
 
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
-	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
+	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+		     __alignof__(struct vcpu_vmx), THIS_MODULE);
 	if (r)
 		goto out3;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ce027d518096..7cb116afa1cd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -243,7 +243,7 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
 void vcpu_load(struct kvm_vcpu *vcpu);
 void vcpu_put(struct kvm_vcpu *vcpu);
 
-int kvm_init(void *opaque, unsigned int vcpu_size,
+int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		  struct module *module);
 void kvm_exit(void);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9ab1a77941ef..f032806a212f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2178,7 +2178,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 	kvm_arch_vcpu_put(vcpu);
 }
 
-int kvm_init(void *opaque, unsigned int vcpu_size,
+int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		  struct module *module)
 {
 	int r;
@@ -2228,8 +2228,9 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 		goto out_free_4;
 
 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
-	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
-					   __alignof__(struct kvm_vcpu),
+	if (!vcpu_align)
+		vcpu_align = __alignof__(struct kvm_vcpu);
+	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
 					   0, NULL);
 	if (!kvm_vcpu_cache) {
 		r = -ENOMEM;
-- 
cgit v1.2.3-55-g7522


From 19b95dba0324e55505682a18ff9a437fbf4a2592 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 28 Apr 2010 15:40:31 +0300
Subject: KVM: VMX: Add definition for msr autoload entry

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/vmx.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index fb9a080740ec..449731890763 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -25,6 +25,8 @@
  *
  */
 
+#include <linux/types.h>
+
 /*
  * Definitions of Primary Processor-Based VM-Execution Controls.
  */
@@ -394,6 +396,10 @@ enum vmcs_field {
 #define ASM_VMX_INVEPT		  ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
 #define ASM_VMX_INVVPID		  ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
 
-
+struct vmx_msr_entry {
+	u32 index;
+	u32 reserved;
+	u64 value;
+} __aligned(16);
 
 #endif
-- 
cgit v1.2.3-55-g7522


From 5dfa3d170e17cbf9e4816a5ba2f5913c31c03e93 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 28 Apr 2010 15:41:03 +0300
Subject: KVM: VMX: Add definitions for guest and host EFER autoswitch vmcs
 entries

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/vmx.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 449731890763..9e6779f7cf2d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -122,6 +122,8 @@ enum vmcs_field {
 	GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
 	GUEST_IA32_PAT			= 0x00002804,
 	GUEST_IA32_PAT_HIGH		= 0x00002805,
+	GUEST_IA32_EFER			= 0x00002806,
+	GUEST_IA32_EFER_HIGH		= 0x00002807,
 	GUEST_PDPTR0                    = 0x0000280a,
 	GUEST_PDPTR0_HIGH               = 0x0000280b,
 	GUEST_PDPTR1                    = 0x0000280c,
@@ -132,6 +134,8 @@ enum vmcs_field {
 	GUEST_PDPTR3_HIGH               = 0x00002811,
 	HOST_IA32_PAT			= 0x00002c00,
 	HOST_IA32_PAT_HIGH		= 0x00002c01,
+	HOST_IA32_EFER			= 0x00002c02,
+	HOST_IA32_EFER_HIGH		= 0x00002c03,
 	PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
 	CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
 	EXCEPTION_BITMAP                = 0x00004004,
-- 
cgit v1.2.3-55-g7522


From 61d2ef2ce3e0161bedf5d2867f546a8df77fa9bc Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 28 Apr 2010 16:40:38 +0300
Subject: KVM: VMX: Add facility to atomically switch MSRs on guest entry/exit

Some guest msr values cannot be used on the host (for example. EFER.NX=0),
so we need to switch them atomically during guest entry or exit.

Add a facility to program the vmx msr autoload registers accordingly.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2e8729678600..ae22dcf17211 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -98,6 +98,8 @@ module_param(ple_gap, int, S_IRUGO);
 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 module_param(ple_window, int, S_IRUGO);
 
+#define NR_AUTOLOAD_MSRS 1
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -125,6 +127,11 @@ struct vcpu_vmx {
 	u64 		      msr_guest_kernel_gs_base;
 #endif
 	struct vmcs          *vmcs;
+	struct msr_autoload {
+		unsigned nr;
+		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
+		struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+	} msr_autoload;
 	struct {
 		int           loaded;
 		u16           fs_sel, gs_sel, ldt_sel;
@@ -595,6 +602,46 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
+static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
+{
+	unsigned i;
+	struct msr_autoload *m = &vmx->msr_autoload;
+
+	for (i = 0; i < m->nr; ++i)
+		if (m->guest[i].index == msr)
+			break;
+
+	if (i == m->nr)
+		return;
+	--m->nr;
+	m->guest[i] = m->guest[m->nr];
+	m->host[i] = m->host[m->nr];
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
+	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+}
+
+static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
+				  u64 guest_val, u64 host_val)
+{
+	unsigned i;
+	struct msr_autoload *m = &vmx->msr_autoload;
+
+	for (i = 0; i < m->nr; ++i)
+		if (m->guest[i].index == msr)
+			break;
+
+	if (i == m->nr) {
+		++m->nr;
+		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
+		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+	}
+
+	m->guest[i].index = msr;
+	m->guest[i].value = guest_val;
+	m->host[i].index = msr;
+	m->host[i].value = host_val;
+}
+
 static void reload_tss(void)
 {
 	/*
@@ -2470,7 +2517,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
 	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
 
 	rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
 	vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
-- 
cgit v1.2.3-55-g7522


From 84ad33ef5dbc12665ad42ee07a2daed473d3ec54 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 28 Apr 2010 16:42:29 +0300
Subject: KVM: VMX: Atomically switch efer if EPT && !EFER.NX

When EPT is enabled, we cannot emulate EFER.NX=0 through the shadow page
tables.  This causes accesses through ptes with bit 63 set to succeed instead
of failing a reserved bit check.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ae22dcf17211..c4f3955c64e0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -678,6 +678,17 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 	guest_efer |= host_efer & ignore_bits;
 	vmx->guest_msrs[efer_offset].data = guest_efer;
 	vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+	clear_atomic_switch_msr(vmx, MSR_EFER);
+	/* On ept, can't emulate nx, and must switch nx atomically */
+	if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
+		guest_efer = vmx->vcpu.arch.efer;
+		if (!(guest_efer & EFER_LMA))
+			guest_efer &= ~EFER_LME;
+		add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
+		return false;
+	}
+
 	return true;
 }
 
@@ -1734,6 +1745,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 	vmcs_write32(VM_ENTRY_CONTROLS,
 		     vmcs_read32(VM_ENTRY_CONTROLS)
 		     & ~VM_ENTRY_IA32E_MODE);
+	vmx_set_efer(vcpu, vcpu->arch.efer);
 }
 
 #endif
-- 
cgit v1.2.3-55-g7522


From f1d86e469b60f9e1afed5c17a6e723c2c9c55ceb Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti
Date: Mon, 3 May 2010 23:04:27 -0300
Subject: KVM: x86: properly update ready_for_interrupt_injection

The recent changes to emulate string instructions without entering guest
mode exposed a bug where pending interrupts are not properly reflected
in ready_for_interrupt_injection.

The result is that userspace overwrites a previously queued interrupt,
when irqchip's are emulated in userspace.

Fix by always updating state before returning to userspace.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6b2ce1d2d748..dff08e527ec7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4653,7 +4653,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 	}
 
 	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-	post_kvm_run_save(vcpu);
 
 	vapic_exit(vcpu);
 
@@ -4703,6 +4702,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	r = __vcpu_run(vcpu);
 
 out:
+	post_kvm_run_save(vcpu);
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-- 
cgit v1.2.3-55-g7522


From cafd66595d92591e4bd25c3904e004fc6f897e2d Mon Sep 17 00:00:00 2001
From: Shane Wang
Date: Thu, 29 Apr 2010 12:09:01 -0400
Subject: KVM: VMX: enable VMXON check with SMX enabled (Intel TXT)

Per document, for feature control MSR:

  Bit 1 enables VMXON in SMX operation. If the bit is clear, execution
        of VMXON in SMX operation causes a general-protection exception.
  Bit 2 enables VMXON outside SMX operation. If the bit is clear, execution
        of VMXON outside SMX operation causes a general-protection exception.

This patch is to enable this kind of check with SMX for VMXON in KVM.

Signed-off-by: Shane Wang <shane.wang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/msr-index.h |  5 +++--
 arch/x86/kernel/tboot.c          |  1 +
 arch/x86/kvm/vmx.c               | 32 +++++++++++++++++++++-----------
 include/linux/tboot.h            |  1 +
 4 files changed, 26 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index bc473acfa7f9..f9324851eba0 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -202,8 +202,9 @@
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
 
-#define FEATURE_CONTROL_LOCKED		(1<<0)
-#define FEATURE_CONTROL_VMXON_ENABLED	(1<<2)
+#define FEATURE_CONTROL_LOCKED				(1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX	(1<<1)
+#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX	(1<<2)
 
 #define MSR_IA32_APICBASE		0x0000001b
 #define MSR_IA32_APICBASE_BSP		(1<<8)
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 86c9f91b48ae..46b827778d16 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -46,6 +46,7 @@
 
 /* Global pointer to shared data; NULL means no measured launch. */
 struct tboot *tboot __read_mostly;
+EXPORT_SYMBOL(tboot);
 
 /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
 #define AP_WAIT_TIMEOUT		1
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c4f3955c64e0..d2a47aefdee7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
 #include <linux/moduleparam.h>
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
+#include <linux/tboot.h>
 #include "kvm_cache_regs.h"
 #include "x86.h"
 
@@ -1272,9 +1273,16 @@ static __init int vmx_disabled_by_bios(void)
 	u64 msr;
 
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
-	return (msr & (FEATURE_CONTROL_LOCKED |
-		       FEATURE_CONTROL_VMXON_ENABLED))
-	    == FEATURE_CONTROL_LOCKED;
+	if (msr & FEATURE_CONTROL_LOCKED) {
+		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
+			&& tboot_enabled())
+			return 1;
+		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+			&& !tboot_enabled())
+			return 1;
+	}
+
+	return 0;
 	/* locked but not enabled */
 }
 
@@ -1282,21 +1290,23 @@ static int hardware_enable(void *garbage)
 {
 	int cpu = raw_smp_processor_id();
 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
-	u64 old;
+	u64 old, test_bits;
 
 	if (read_cr4() & X86_CR4_VMXE)
 		return -EBUSY;
 
 	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-	if ((old & (FEATURE_CONTROL_LOCKED |
-		    FEATURE_CONTROL_VMXON_ENABLED))
-	    != (FEATURE_CONTROL_LOCKED |
-		FEATURE_CONTROL_VMXON_ENABLED))
+
+	test_bits = FEATURE_CONTROL_LOCKED;
+	test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+	if (tboot_enabled())
+		test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
+
+	if ((old & test_bits) != test_bits) {
 		/* enable and lock */
-		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
-		       FEATURE_CONTROL_LOCKED |
-		       FEATURE_CONTROL_VMXON_ENABLED);
+		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
+	}
 	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
 	asm volatile (ASM_VMX_VMXON_RAX
 		      : : "a"(&phys_addr), "m"(phys_addr)
diff --git a/include/linux/tboot.h b/include/linux/tboot.h
index bf2a0c748878..1dba6ee55203 100644
--- a/include/linux/tboot.h
+++ b/include/linux/tboot.h
@@ -150,6 +150,7 @@ extern int tboot_force_iommu(void);
 
 #else
 
+#define tboot_enabled()			0
 #define tboot_probe()			do { } while (0)
 #define tboot_shutdown(shutdown_type)	do { } while (0)
 #define tboot_sleep(sleep_state, pm1a_control, pm1b_control)	\
-- 
cgit v1.2.3-55-g7522


From 8facbbff071ff2b19268d3732e31badc60471e21 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Tue, 4 May 2010 12:58:32 +0300
Subject: KVM: MMU: Don't read pdptrs with mmu spinlock held in mmu_alloc_roots

On svm, kvm_read_pdptr() may require reading guest memory, which can sleep.

Push the spinlock into mmu_alloc_roots(), and only take it after we've read
the pdptr.

Tested-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 51eb6d6abd86..de996380ec26 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2065,11 +2065,13 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			direct = 1;
 			root_gfn = 0;
 		}
+		spin_lock(&vcpu->kvm->mmu_lock);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 				      PT64_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
+		spin_unlock(&vcpu->kvm->mmu_lock);
 		vcpu->arch.mmu.root_hpa = root;
 		return 0;
 	}
@@ -2093,11 +2095,14 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			direct = 1;
 			root_gfn = i << 30;
 		}
+		spin_lock(&vcpu->kvm->mmu_lock);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
+		spin_unlock(&vcpu->kvm->mmu_lock);
+
 		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
 	}
 	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
@@ -2466,7 +2471,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 		goto out;
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
+	spin_unlock(&vcpu->kvm->mmu_lock);
 	r = mmu_alloc_roots(vcpu);
+	spin_lock(&vcpu->kvm->mmu_lock);
 	mmu_sync_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	if (r)
-- 
cgit v1.2.3-55-g7522


From 9ed3c444ab8987c7b219173a2f7807e3f71e234e Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Tue, 4 May 2010 15:00:37 +0300
Subject: KVM: Fix wallclock version writing race

Wallclock writing uses an unprotected global variable to hold the version;
this can cause one guest to interfere with another if both write their
wallclock at the same time.

Acked-by: Glauber Costa <glommer@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dff08e527ec7..54f73b6a006b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -754,14 +754,22 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 
 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
-	static int version;
+	int version;
+	int r;
 	struct pvclock_wall_clock wc;
 	struct timespec boot;
 
 	if (!wall_clock)
 		return;
 
-	version++;
+	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
+	if (r)
+		return;
+
+	if (version & 1)
+		++version;  /* first time write, random junk */
+
+	++version;
 
 	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 
-- 
cgit v1.2.3-55-g7522


From 3f0fd2927b737c0ac2e04af7858b60d1e927d4b1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 5 May 2010 16:04:41 +0200
Subject: KVM: x86: Fix exception reinjection forced to true

The patch merged recently which allowed to mark an exception
as reinjected has a bug as it always marks the exception as
reinjected. This breaks nested-svm shadow-on-shadow
implementation.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54f73b6a006b..161ede2b5f91 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -277,7 +277,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 		vcpu->arch.exception.has_error_code = has_error;
 		vcpu->arch.exception.nr = nr;
 		vcpu->arch.exception.error_code = error_code;
-		vcpu->arch.exception.reinject = true;
+		vcpu->arch.exception.reinject = reinject;
 		return;
 	}
 
-- 
cgit v1.2.3-55-g7522


From 0d945bd9351199744c1e89d57a70615b6ee9f394 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 5 May 2010 16:04:45 +0200
Subject: KVM: SVM: Don't allow nested guest to VMMCALL into host

This patch disables the possibility for a l2-guest to do a
VMMCALL directly into the host. This would happen if the
l1-hypervisor doesn't intercept VMMCALL and the l2-guest
executes this instruction.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2511664ff671..4a66ffe1dc87 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2036,6 +2036,9 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
 	}
 
+	/* We don't want to see VMMCALLs from a nested guest */
+	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
+
 	/*
 	 * We don't want a nested guest to be more powerful than the guest, so
 	 * all intercepts are ORed
-- 
cgit v1.2.3-55-g7522


From b69e8caef5b190af48c525f6d715e7b7728a77f6 Mon Sep 17 00:00:00 2001
From: Roedel, Joerg
Date: Thu, 6 May 2010 11:38:43 +0200
Subject: KVM: x86: Inject #GP with the right rip on efer writes

This patch fixes a bug in the KVM efer-msr write path. If a
guest writes to a reserved efer bit the set_efer function
injects the #GP directly. The architecture dependent wrmsr
function does not see this, assumes success and advances the
rip. This results in a #GP in the guest with the wrong rip.
This patch fixes this by reporting efer write errors back to
the architectural wrmsr function.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 161ede2b5f91..fe6d126633d8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -683,37 +683,29 @@ static u32 emulated_msrs[] = {
 	MSR_IA32_MISC_ENABLE,
 };
 
-static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
+static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
-	if (efer & efer_reserved_bits) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if (efer & efer_reserved_bits)
+		return 1;
 
 	if (is_paging(vcpu)
-	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+		return 1;
 
 	if (efer & EFER_FFXSR) {
 		struct kvm_cpuid_entry2 *feat;
 
 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
-		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
+		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
+			return 1;
 	}
 
 	if (efer & EFER_SVME) {
 		struct kvm_cpuid_entry2 *feat;
 
 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
-		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
+		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
+			return 1;
 	}
 
 	kvm_x86_ops->set_efer(vcpu, efer);
@@ -725,6 +717,8 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 	kvm_mmu_reset_context(vcpu);
+
+	return 0;
 }
 
 void kvm_enable_efer_bits(u64 mask)
@@ -1153,8 +1147,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	switch (msr) {
 	case MSR_EFER:
-		set_efer(vcpu, data);
-		break;
+		return set_efer(vcpu, data);
 	case MSR_K7_HWCR:
 		data &= ~(u64)0x40;	/* ignore flush filter disable */
 		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
-- 
cgit v1.2.3-55-g7522


From 424c32f1aa3112632a657d45698c8e7666668f78 Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Tue, 11 May 2010 12:17:39 -0400
Subject: x86, paravirt: Enable pvclock flags in vcpu_time_info structure

This patch removes one padding byte and transform it into a flags
field. New versions of guests using pvclock will query these flags
upon each read.

Flags, however, will only be interpreted when the guest decides to.
It uses the pvclock_valid_flags function to signal that a specific
set of flags should be taken into consideration. Which flags are valid
are usually devised via HV negotiation.

Signed-off-by: Glauber Costa <glommer@redhat.com>
CC: Jeremy Fitzhardinge <jeremy@goop.org>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/pvclock-abi.h | 3 ++-
 arch/x86/include/asm/pvclock.h     | 1 +
 arch/x86/kernel/pvclock.c          | 9 +++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 6d93508f2626..ec5c41ac4bef 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -29,7 +29,8 @@ struct pvclock_vcpu_time_info {
 	u64   system_time;
 	u32   tsc_to_system_mul;
 	s8    tsc_shift;
-	u8    pad[3];
+	u8    flags;
+	u8    pad[2];
 } __attribute__((__packed__)); /* 32 bytes */
 
 struct pvclock_wall_clock {
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 53235fd5f8ce..cd02f324aa6b 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
 
 /* some helper functions for xen and kvm pv clock sources */
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+void pvclock_set_flags(u8 flags);
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
 			    struct pvclock_vcpu_time_info *vcpu,
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 03801f2f761f..f7fdd56bc0ab 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -31,8 +31,16 @@ struct pvclock_shadow_time {
 	u32 tsc_to_nsec_mul;
 	int tsc_shift;
 	u32 version;
+	u8  flags;
 };
 
+static u8 valid_flags __read_mostly = 0;
+
+void pvclock_set_flags(u8 flags)
+{
+	valid_flags = flags;
+}
+
 /*
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
  * yielding a 64-bit result.
@@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
 		dst->system_timestamp  = src->system_time;
 		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
 		dst->tsc_shift         = src->tsc_shift;
+		dst->flags             = src->flags;
 		rmb();		/* test version after fetching data */
 	} while ((src->version & 1) || (dst->version != src->version));
 
-- 
cgit v1.2.3-55-g7522


From 489fb490dbf8dab0249ad82b56688ae3842a79e8 Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Tue, 11 May 2010 12:17:40 -0400
Subject: x86, paravirt: Add a global synchronization point for pvclock

In recent stress tests, it was found that pvclock-based systems
could seriously warp in smp systems. Using ingo's time-warp-test.c,
I could trigger a scenario as bad as 1.5mi warps a minute in some systems.
(to be fair, it wasn't that bad in most of them). Investigating further, I
found out that such warps were caused by the very offset-based calculation
pvclock is based on.

This happens even on some machines that report constant_tsc in its tsc flags,
specially on multi-socket ones.

Two reads of the same kernel timestamp at approx the same time, will likely
have tsc timestamped in different occasions too. This means the delta we
calculate is unpredictable at best, and can probably be smaller in a cpu
that is legitimately reading clock in a forward ocasion.

Some adjustments on the host could make this window less likely to happen,
but still, it pretty much poses as an intrinsic problem of the mechanism.

A while ago, I though about using a shared variable anyway, to hold clock
last state, but gave up due to the high contention locking was likely
to introduce, possibly rendering the thing useless on big machines. I argue,
however, that locking is not necessary.

We do a read-and-return sequence in pvclock, and between read and return,
the global value can have changed. However, it can only have changed
by means of an addition of a positive value. So if we detected that our
clock timestamp is less than the current global, we know that we need to
return a higher one, even though it is not exactly the one we compared to.

OTOH, if we detect we're greater than the current time source, we atomically
replace the value with our new readings. This do causes contention on big
boxes (but big here means *BIG*), but it seems like a good trade off, since
it provide us with a time source guaranteed to be stable wrt time warps.

After this patch is applied, I don't see a single warp in time during 5 days
of execution, in any of the machines I saw them before.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
CC: Jeremy Fitzhardinge <jeremy@goop.org>
CC: Avi Kivity <avi@redhat.com>
CC: Marcelo Tosatti <mtosatti@redhat.com>
CC: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kernel/pvclock.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index f7fdd56bc0ab..f5bc40e1697e 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -118,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
 	return pv_tsc_khz;
 }
 
+static atomic64_t last_value = ATOMIC64_INIT(0);
+
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
 	struct pvclock_shadow_time shadow;
 	unsigned version;
 	cycle_t ret, offset;
+	u64 last;
 
 	do {
 		version = pvclock_get_time_values(&shadow, src);
@@ -132,6 +135,27 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 		barrier();
 	} while (version != src->version);
 
+	/*
+	 * Assumption here is that last_value, a global accumulator, always goes
+	 * forward. If we are less than that, we should not be much smaller.
+	 * We assume there is an error marging we're inside, and then the correction
+	 * does not sacrifice accuracy.
+	 *
+	 * For reads: global may have changed between test and return,
+	 * but this means someone else updated poked the clock at a later time.
+	 * We just need to make sure we are not seeing a backwards event.
+	 *
+	 * For updates: last_value = ret is not enough, since two vcpus could be
+	 * updating at the same time, and one of them could be slightly behind,
+	 * making the assumption that last_value always go forward fail to hold.
+	 */
+	last = atomic64_read(&last_value);
+	do {
+		if (ret < last)
+			return last;
+		last = atomic64_cmpxchg(&last_value, last, ret);
+	} while (unlikely(last != ret));
+
 	return ret;
 }
 
-- 
cgit v1.2.3-55-g7522


From 11c6bffa42b85e703c21a1d2372dce7262daca8e Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Tue, 11 May 2010 12:17:41 -0400
Subject: KVM: x86: change msr numbers for kvmclock

Avi pointed out a while ago that those MSRs falls into the pentium
PMU range. So the idea here is to add new ones, and after a while,
deprecate the old ones.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_para.h | 4 ++++
 arch/x86/kvm/x86.c              | 7 ++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index ffae1420e7d7..97348082782a 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -20,6 +20,10 @@
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
+#define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
+#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+
 #define KVM_MAX_MMU_OP_BATCH           32
 
 /* Operations for KVM_HC_MMU_OP */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fe6d126633d8..73d342c69ed4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -664,9 +664,10 @@ static inline u32 bit(int bitno)
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	5
+#define KVM_SAVE_MSRS_BEGIN	7
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 	HV_X64_MSR_APIC_ASSIST_PAGE,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
@@ -1193,10 +1194,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
+	case MSR_KVM_WALL_CLOCK_NEW:
 	case MSR_KVM_WALL_CLOCK:
 		vcpu->kvm->arch.wall_clock = data;
 		kvm_write_wall_clock(vcpu->kvm, data);
 		break;
+	case MSR_KVM_SYSTEM_TIME_NEW:
 	case MSR_KVM_SYSTEM_TIME: {
 		if (vcpu->arch.time_page) {
 			kvm_release_page_dirty(vcpu->arch.time_page);
@@ -1468,9 +1471,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		data = vcpu->arch.efer;
 		break;
 	case MSR_KVM_WALL_CLOCK:
+	case MSR_KVM_WALL_CLOCK_NEW:
 		data = vcpu->kvm->arch.wall_clock;
 		break;
 	case MSR_KVM_SYSTEM_TIME:
+	case MSR_KVM_SYSTEM_TIME_NEW:
 		data = vcpu->arch.time;
 		break;
 	case MSR_IA32_P5_MC_ADDR:
-- 
cgit v1.2.3-55-g7522


From 0e6ac58acbcddbc9d1687214f0d43d8657cc036c Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Tue, 11 May 2010 12:17:42 -0400
Subject: KVM: x86: add new KVMCLOCK cpuid feature

This cpuid, KVM_CPUID_CLOCKSOURCE2, will indicate to the guest
that kvmclock is available through a new set of MSRs. The old ones
are deprecated.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_para.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 97348082782a..f019f8cb182e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -16,6 +16,10 @@
 #define KVM_FEATURE_CLOCKSOURCE		0
 #define KVM_FEATURE_NOP_IO_DELAY	1
 #define KVM_FEATURE_MMU_OP		2
+/* This indicates that the new set of kvmclock msrs
+ * are available. The use of 0x11 and 0x12 is deprecated
+ */
+#define KVM_FEATURE_CLOCKSOURCE2        3
 
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
-- 
cgit v1.2.3-55-g7522


From 84478c829d0f474a1d6749207c53daacc305d4e1 Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Tue, 11 May 2010 12:17:43 -0400
Subject: KVM: x86: export paravirtual cpuid flags in KVM_GET_SUPPORTED_CPUID

Right now, we were using individual KVM_CAP entities to communicate
userspace about which cpuids we support. This is suboptimal, since it
generates a delay between the feature arriving in the host, and
being available at the guest.

A much better mechanism is to list para features in KVM_GET_SUPPORTED_CPUID.
This makes userspace automatically aware of what we provide. And if we
ever add a new cpuid bit in the future, we have to do that again,
which create some complexity and delay in feature adoption.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 73d342c69ed4..419c4512e270 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1972,6 +1972,23 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		break;
 	}
+	case KVM_CPUID_SIGNATURE: {
+		char signature[12] = "KVMKVMKVM\0\0";
+		u32 *sigptr = (u32 *)signature;
+		entry->eax = 0;
+		entry->ebx = sigptr[0];
+		entry->ecx = sigptr[1];
+		entry->edx = sigptr[2];
+		break;
+	}
+	case KVM_CPUID_FEATURES:
+		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
+			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
+			     (1 << KVM_FEATURE_CLOCKSOURCE2);
+		entry->ebx = 0;
+		entry->ecx = 0;
+		entry->edx = 0;
+		break;
 	case 0x80000000:
 		entry->eax = min(entry->eax, 0x8000001a);
 		break;
@@ -2018,6 +2035,23 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
 		do_cpuid_ent(&cpuid_entries[nent], func, 0,
 			     &nent, cpuid->nent);
+
+
+
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
+		     cpuid->nent);
+
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
+		     cpuid->nent);
+
 	r = -E2BIG;
 	if (nent >= cpuid->nent)
 		goto out_free;
-- 
cgit v1.2.3-55-g7522


From 838815a78785022f6611e5c48386567aea7b818b Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Tue, 11 May 2010 12:17:44 -0400
Subject: x86: KVM guest: Try using new kvm clock msrs

We now added a new set of clock-related msrs in replacement of the old
ones. In theory, we could just try to use them and get a return value
indicating they do not exist, due to our use of kvm_write_msr_save.

However, kvm clock registration happens very early, and if we ever
try to write to a non-existant MSR, we raise a lethal #GP, since our
idt handlers are not in place yet.

So this patch tests for a cpuid feature exported by the host to
decide which set of msrs are supported.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kernel/kvmclock.c | 53 ++++++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index feaeb0d3aa4f..59c740fcb9b3 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -29,6 +29,8 @@
 #define KVM_SCALE 22
 
 static int kvmclock = 1;
+static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
 
 static int parse_no_kvmclock(char *arg)
 {
@@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void)
 
 	low = (int)__pa_symbol(&wall_clock);
 	high = ((u64)__pa_symbol(&wall_clock) >> 32);
-	native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+
+	native_write_msr(msr_kvm_wall_clock, low, high);
 
 	vcpu_time = &get_cpu_var(hv_clock);
 	pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
@@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt)
 	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
 	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
 	       cpu, high, low, txt);
-	return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+
+	return native_write_msr_safe(msr_kvm_system_time, low, high);
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void)
 #ifdef CONFIG_KEXEC
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
-	native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+	native_write_msr(msr_kvm_system_time, 0, 0);
 	native_machine_crash_shutdown(regs);
 }
 #endif
 
 static void kvm_shutdown(void)
 {
-	native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+	native_write_msr(msr_kvm_system_time, 0, 0);
 	native_machine_shutdown();
 }
 
@@ -181,27 +185,34 @@ void __init kvmclock_init(void)
 	if (!kvm_para_available())
 		return;
 
-	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
-		if (kvm_register_clock("boot clock"))
-			return;
-		pv_time_ops.sched_clock = kvm_clock_read;
-		x86_platform.calibrate_tsc = kvm_get_tsc_khz;
-		x86_platform.get_wallclock = kvm_get_wallclock;
-		x86_platform.set_wallclock = kvm_set_wallclock;
+	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
+		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+	} else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
+		return;
+
+	printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+		msr_kvm_system_time, msr_kvm_wall_clock);
+
+	if (kvm_register_clock("boot clock"))
+		return;
+	pv_time_ops.sched_clock = kvm_clock_read;
+	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+	x86_platform.get_wallclock = kvm_get_wallclock;
+	x86_platform.set_wallclock = kvm_set_wallclock;
 #ifdef CONFIG_X86_LOCAL_APIC
-		x86_cpuinit.setup_percpu_clockev =
-			kvm_setup_secondary_clock;
+	x86_cpuinit.setup_percpu_clockev =
+		kvm_setup_secondary_clock;
 #endif
 #ifdef CONFIG_SMP
-		smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 #endif
-		machine_ops.shutdown  = kvm_shutdown;
+	machine_ops.shutdown  = kvm_shutdown;
 #ifdef CONFIG_KEXEC
-		machine_ops.crash_shutdown  = kvm_crash_shutdown;
+	machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
-		kvm_get_preset_lpj();
-		clocksource_register(&kvm_clock);
-		pv_info.paravirt_enabled = 1;
-		pv_info.name = "KVM";
-	}
+	kvm_get_preset_lpj();
+	clocksource_register(&kvm_clock);
+	pv_info.paravirt_enabled = 1;
+	pv_info.name = "KVM";
 }
-- 
cgit v1.2.3-55-g7522


From 3a0d7256a6fb8c13f9fac6cd63250f97a8f0d8de Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Tue, 11 May 2010 12:17:45 -0400
Subject: x86, paravirt: don't compute pvclock adjustments if we trust the tsc

If the HV told us we can fully trust the TSC, skip any
correction

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_para.h    | 5 +++++
 arch/x86/include/asm/pvclock-abi.h | 1 +
 arch/x86/kernel/kvmclock.c         | 3 +++
 arch/x86/kernel/pvclock.c          | 4 ++++
 4 files changed, 13 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index f019f8cb182e..05eba5e9a8e8 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -21,6 +21,11 @@
  */
 #define KVM_FEATURE_CLOCKSOURCE2        3
 
+/* The last 8 bits are used to indicate how to interpret the flags field
+ * in pvclock structure. If no bits are set, all flags are ignored.
+ */
+#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT	24
+
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index ec5c41ac4bef..35f2d1948ada 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -39,5 +39,6 @@ struct pvclock_wall_clock {
 	u32   nsec;
 } __attribute__((__packed__));
 
+#define PVCLOCK_TSC_STABLE_BIT	(1 << 0)
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 59c740fcb9b3..eb9b76c716c2 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -215,4 +215,7 @@ void __init kvmclock_init(void)
 	clocksource_register(&kvm_clock);
 	pv_info.paravirt_enabled = 1;
 	pv_info.name = "KVM";
+
+	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 }
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index f5bc40e1697e..239427ca02af 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -135,6 +135,10 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 		barrier();
 	} while (version != src->version);
 
+	if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
+		(shadow.flags & PVCLOCK_TSC_STABLE_BIT))
+		return ret;
+
 	/*
 	 * Assumption here is that last_value, a global accumulator, always goes
 	 * forward. If we are less than that, we should not be much smaller.
-- 
cgit v1.2.3-55-g7522


From 371bcf646d170ee1325abaf4f3e73485b4fd4d2d Mon Sep 17 00:00:00 2001
From: Glauber Costa
Date: Tue, 11 May 2010 12:17:46 -0400
Subject: KVM: x86: Tell the guest we'll warn it about tsc stability

This patch puts up the flag that tells the guest that we'll warn it
about the tsc being trustworthy or not. By now, we also say
it is not.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 419c4512e270..474a27fc42df 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -857,6 +857,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	vcpu->hv_clock.system_time = ts.tv_nsec +
 				     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
 
+	vcpu->hv_clock.flags = 0;
+
 	/*
 	 * The interface expects us to write an even number signaling that the
 	 * update is finished. Since the guest won't see the intermediate
@@ -1984,7 +1986,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	case KVM_CPUID_FEATURES:
 		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
 			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
-			     (1 << KVM_FEATURE_CLOCKSOURCE2);
+			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
+			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
 		entry->ebx = 0;
 		entry->ecx = 0;
 		entry->edx = 0;
-- 
cgit v1.2.3-55-g7522


From f78e917688edbf1f14c318d2e50dc8e7dad20445 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 12 May 2010 00:28:44 +0300
Subject: KVM: Don't allow lmsw to clear cr0.pe

The current lmsw implementation allows the guest to clear cr0.pe, contrary
to the manual, which breaks EMM386.EXE.

Fix by ORing the old cr0.pe with lmsw's operand.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 474a27fc42df..fa1c51925597 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -470,7 +470,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
 
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
-	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
+	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
-- 
cgit v1.2.3-55-g7522


From a3d204e28579427609c3d15d2310127ebaa47d94 Mon Sep 17 00:00:00 2001
From: Sheng Yang
Date: Wed, 12 May 2010 16:40:40 +0800
Subject: KVM: x86: Check LMA bit before set_efer

kvm_x86_ops->set_efer() would execute vcpu->arch.efer = efer, so the
checking of LMA bit didn't work.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa1c51925597..f0846d2aa956 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -709,11 +709,11 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 			return 1;
 	}
 
-	kvm_x86_ops->set_efer(vcpu, efer);
-
 	efer &= ~EFER_LMA;
 	efer |= vcpu->arch.efer & EFER_LMA;
 
+	kvm_x86_ops->set_efer(vcpu, efer);
+
 	vcpu->arch.efer = efer;
 
 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
-- 
cgit v1.2.3-55-g7522


From 3dbe141595faa48a067add3e47bba3205b79d33c Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 12 May 2010 11:48:18 +0300
Subject: KVM: MMU: Segregate shadow pages with different cr0.wp

When cr0.wp=0, we may shadow a gpte having u/s=1 and r/w=0 with an spte
having u/s=0 and r/w=1.  This allows excessive access if the guest sets
cr0.wp=1 and accesses through this spte.

Fix by making cr0.wp part of the base role; we'll have different sptes for
the two cases and the problem disappears.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/kvm/mmu.txt       | 2 ++
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/mmu.c              | 3 ++-
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt
index 0cc28fb84f4f..aaed6ab9d7ab 100644
--- a/Documentation/kvm/mmu.txt
+++ b/Documentation/kvm/mmu.txt
@@ -163,6 +163,8 @@ Shadow pages contain the following information:
     32-bit or 64-bit gptes are in use).
   role.cr4_nxe:
     Contains the value of efer.nxe for which the page is valid.
+  role.cr0_wp:
+    Contains the value of cr0.wp for which the page is valid.
   gfn:
     Either the guest page table containing the translations shadowed by this
     page, or the base page frame for linear translations.  See role.direct.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3f0007b076da..76f5483cffec 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -179,6 +179,7 @@ union kvm_mmu_page_role {
 		unsigned access:3;
 		unsigned invalid:1;
 		unsigned nxe:1;
+		unsigned cr0_wp:1;
 	};
 };
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index de996380ec26..81563e76e28f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -217,7 +217,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
-static int is_write_protection(struct kvm_vcpu *vcpu)
+static bool is_write_protection(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
@@ -2432,6 +2432,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 		r = paging32_init_context(vcpu);
 
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+	vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
 
 	return r;
 }
-- 
cgit v1.2.3-55-g7522


From 8fbf065d625617bbbf6b72d5f78f84ad13c8b547 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 13 May 2010 11:50:19 +0300
Subject: KVM: x86: Add missing locking to arch specific vcpu ioctls

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f0846d2aa956..39f495802050 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1833,6 +1833,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 {
 	int r;
 
+	vcpu_load(vcpu);
 	r = -E2BIG;
 	if (cpuid->nent < vcpu->arch.cpuid_nent)
 		goto out;
@@ -1844,6 +1845,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 
 out:
 	cpuid->nent = vcpu->arch.cpuid_nent;
+	vcpu_put(vcpu);
 	return r;
 }
 
@@ -2134,6 +2136,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	int r;
 	unsigned bank_num = mcg_cap & 0xff, bank;
 
+	vcpu_load(vcpu);
 	r = -EINVAL;
 	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
 		goto out;
@@ -2148,6 +2151,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	for (bank = 0; bank < bank_num; bank++)
 		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 out:
+	vcpu_put(vcpu);
 	return r;
 }
 
@@ -2456,7 +2460,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&mce, argp, sizeof mce))
 			goto out;
+		vcpu_load(vcpu);
 		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+		vcpu_put(vcpu);
 		break;
 	}
 	case KVM_GET_VCPU_EVENTS: {
-- 
cgit v1.2.3-55-g7522


From d334a49113a4a33109fd24e46073280ecd1bea0d Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Tue, 18 May 2010 14:35:20 +0800
Subject: ACPI, APEI, Generic Hardware Error Source memory error support

Generic Hardware Error Source provides a way to report platform
hardware errors (such as that from chipset). It works in so called
"Firmware First" mode, that is, hardware errors are reported to
firmware firstly, then reported to Linux by firmware. This way, some
non-standard hardware error registers or non-standard hardware link
can be checked by firmware to produce more valuable hardware error
information for Linux.

Now, only SCI notification type and memory errors are supported. More
notification type and hardware error type will be added later. These
memory errors are reported to user space through /dev/mcelog via
faking a corrected Machine Check, so that the error memory page can be
offlined by /sbin/mcelog if the error count for one page is beyond the
threshold.

On some machines, Machine Check can not report physical address for
some corrected memory errors, but GHES can do that. So this simplified
GHES is implemented firstly.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/include/asm/mce.h            |   8 +
 arch/x86/kernel/cpu/mcheck/Makefile   |   2 +
 arch/x86/kernel/cpu/mcheck/mce-apei.c |  52 +++++
 drivers/acpi/apei/Kconfig             |  14 ++
 drivers/acpi/apei/Makefile            |   1 +
 drivers/acpi/apei/ghes.c              | 427 ++++++++++++++++++++++++++++++++++
 6 files changed, 504 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/mcheck/mce-apei.c
 create mode 100644 drivers/acpi/apei/ghes.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6c3fdd631ed3..f32a4301c4d4 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -225,5 +225,13 @@ extern void mcheck_intel_therm_init(void);
 static inline void mcheck_intel_therm_init(void) { }
 #endif
 
+/*
+ * Used by APEI to report memory error via /dev/mcelog
+ */
+
+struct cper_sec_mem_err;
+extern void apei_mce_report_mem_error(int corrected,
+				      struct cper_sec_mem_err *mem_err);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 4ac6d48fe11b..bb34b03af252 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
 obj-$(CONFIG_X86_MCE_INJECT)	+= mce-inject.o
 
 obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
+
+obj-$(CONFIG_ACPI_APEI)		+= mce-apei.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
new file mode 100644
index 000000000000..4eccd1fadb14
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -0,0 +1,52 @@
+/*
+ * Bridge between MCE and APEI
+ *
+ * On some machine, corrected memory errors are reported via APEI
+ * generic hardware error source (GHES) instead of corrected Machine
+ * Check. These corrected memory errors can be reported to user space
+ * through /dev/mcelog via faking a corrected Machine Check, so that
+ * the error memory page can be offlined by /sbin/mcelog if the error
+ * count for one page is beyond the threshold.
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/cper.h>
+#include <acpi/apei.h>
+#include <asm/mce.h>
+
+#include "mce-internal.h"
+
+void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
+{
+	struct mce m;
+
+	/* Only corrected MC is reported */
+	if (!corrected)
+		return;
+
+	mce_setup(&m);
+	m.bank = 1;
+	/* Fake a memory read corrected error with unknown channel */
+	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+	m.addr = mem_err->physical_addr;
+	mce_log(&m);
+	mce_notify_irq();
+}
+EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index 5f0a41c2bc62..f8c668f27b5a 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -7,6 +7,20 @@ config ACPI_APEI
 	  especially. In addition it supports error serialization and
 	  error injection.
 
+config ACPI_APEI_GHES
+	tristate "APEI Generic Hardware Error Source"
+	depends on ACPI_APEI && X86
+	select ACPI_HED
+	help
+	  Generic Hardware Error Source provides a way to report
+	  platform hardware errors (such as that from chipset). It
+	  works in so called "Firmware First" mode, that is, hardware
+	  errors are reported to firmware firstly, then reported to
+	  Linux by firmware. This way, some non-standard hardware
+	  error registers or non-standard hardware link can be checked
+	  by firmware to produce more valuable hardware error
+	  information for Linux.
+
 config ACPI_APEI_EINJ
 	tristate "APEI Error INJection (EINJ)"
 	depends on ACPI_APEI && DEBUG_FS
diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile
index fef963ec5362..41c61db4c51c 100644
--- a/drivers/acpi/apei/Makefile
+++ b/drivers/acpi/apei/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_ACPI_APEI)		+= apei.o
+obj-$(CONFIG_ACPI_APEI_GHES)	+= ghes.o
 obj-$(CONFIG_ACPI_APEI_EINJ)	+= einj.o
 
 apei-y := apei-base.o hest.o cper.o
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
new file mode 100644
index 000000000000..fd0cc016a099
--- /dev/null
+++ b/drivers/acpi/apei/ghes.c
@@ -0,0 +1,427 @@
+/*
+ * APEI Generic Hardware Error Source support
+ *
+ * Generic Hardware Error Source provides a way to report platform
+ * hardware errors (such as that from chipset). It works in so called
+ * "Firmware First" mode, that is, hardware errors are reported to
+ * firmware firstly, then reported to Linux by firmware. This way,
+ * some non-standard hardware error registers or non-standard hardware
+ * link can be checked by firmware to produce more hardware error
+ * information for Linux.
+ *
+ * For more information about Generic Hardware Error Source, please
+ * refer to ACPI Specification version 4.0, section 17.3.2.6
+ *
+ * Now, only SCI notification type and memory errors are
+ * supported. More notification type and hardware error type will be
+ * added later.
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/cper.h>
+#include <linux/kdebug.h>
+#include <acpi/apei.h>
+#include <acpi/atomicio.h>
+#include <acpi/hed.h>
+#include <asm/mce.h>
+
+#include "apei-internal.h"
+
+#define GHES_PFX	"GHES: "
+
+#define GHES_ESTATUS_MAX_SIZE		65536
+
+/*
+ * One struct ghes is created for each generic hardware error
+ * source.
+ *
+ * It provides the context for APEI hardware error timer/IRQ/SCI/NMI
+ * handler. Handler for one generic hardware error source is only
+ * triggered after the previous one is done. So handler can uses
+ * struct ghes without locking.
+ *
+ * estatus: memory buffer for error status block, allocated during
+ * HEST parsing.
+ */
+#define GHES_TO_CLEAR		0x0001
+
+struct ghes {
+	struct acpi_hest_generic *generic;
+	struct acpi_hest_generic_status *estatus;
+	struct list_head list;
+	u64 buffer_paddr;
+	unsigned long flags;
+};
+
+/*
+ * Error source lists, one list for each notification method. The
+ * members in lists are struct ghes.
+ *
+ * The list members are only added in HEST parsing and deleted during
+ * module_exit, that is, single-threaded. So no lock is needed for
+ * that.
+ *
+ * But the mutual exclusion is needed between members adding/deleting
+ * and timer/IRQ/SCI/NMI handler, which may traverse the list. RCU is
+ * used for that.
+ */
+static LIST_HEAD(ghes_sci);
+
+static struct ghes *ghes_new(struct acpi_hest_generic *generic)
+{
+	struct ghes *ghes;
+	unsigned int error_block_length;
+	int rc;
+
+	ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
+	if (!ghes)
+		return ERR_PTR(-ENOMEM);
+	ghes->generic = generic;
+	INIT_LIST_HEAD(&ghes->list);
+	rc = acpi_pre_map_gar(&generic->error_status_address);
+	if (rc)
+		goto err_free;
+	error_block_length = generic->error_block_length;
+	if (error_block_length > GHES_ESTATUS_MAX_SIZE) {
+		pr_warning(FW_WARN GHES_PFX
+			   "Error status block length is too long: %u for "
+			   "generic hardware error source: %d.\n",
+			   error_block_length, generic->header.source_id);
+		error_block_length = GHES_ESTATUS_MAX_SIZE;
+	}
+	ghes->estatus = kmalloc(error_block_length, GFP_KERNEL);
+	if (!ghes->estatus) {
+		rc = -ENOMEM;
+		goto err_unmap;
+	}
+
+	return ghes;
+
+err_unmap:
+	acpi_post_unmap_gar(&generic->error_status_address);
+err_free:
+	kfree(ghes);
+	return ERR_PTR(rc);
+}
+
+static void ghes_fini(struct ghes *ghes)
+{
+	kfree(ghes->estatus);
+	acpi_post_unmap_gar(&ghes->generic->error_status_address);
+}
+
+enum {
+	GHES_SER_NO = 0x0,
+	GHES_SER_CORRECTED = 0x1,
+	GHES_SER_RECOVERABLE = 0x2,
+	GHES_SER_PANIC = 0x3,
+};
+
+static inline int ghes_severity(int severity)
+{
+	switch (severity) {
+	case CPER_SER_INFORMATIONAL:
+		return GHES_SER_NO;
+	case CPER_SER_CORRECTED:
+		return GHES_SER_CORRECTED;
+	case CPER_SER_RECOVERABLE:
+		return GHES_SER_RECOVERABLE;
+	case CPER_SER_FATAL:
+		return GHES_SER_PANIC;
+	default:
+		/* Unkown, go panic */
+		return GHES_SER_PANIC;
+	}
+}
+
+/* SCI handler run in work queue, so ioremap can be used here */
+static int ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
+				 int from_phys)
+{
+	void *vaddr;
+
+	vaddr = ioremap_cache(paddr, len);
+	if (!vaddr)
+		return -ENOMEM;
+	if (from_phys)
+		memcpy(buffer, vaddr, len);
+	else
+		memcpy(vaddr, buffer, len);
+	iounmap(vaddr);
+
+	return 0;
+}
+
+static int ghes_read_estatus(struct ghes *ghes, int silent)
+{
+	struct acpi_hest_generic *g = ghes->generic;
+	u64 buf_paddr;
+	u32 len;
+	int rc;
+
+	rc = acpi_atomic_read(&buf_paddr, &g->error_status_address);
+	if (rc) {
+		if (!silent && printk_ratelimit())
+			pr_warning(FW_WARN GHES_PFX
+"Failed to read error status block address for hardware error source: %d.\n",
+				   g->header.source_id);
+		return -EIO;
+	}
+	if (!buf_paddr)
+		return -ENOENT;
+
+	rc = ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
+				   sizeof(*ghes->estatus), 1);
+	if (rc)
+		return rc;
+	if (!ghes->estatus->block_status)
+		return -ENOENT;
+
+	ghes->buffer_paddr = buf_paddr;
+	ghes->flags |= GHES_TO_CLEAR;
+
+	rc = -EIO;
+	len = apei_estatus_len(ghes->estatus);
+	if (len < sizeof(*ghes->estatus))
+		goto err_read_block;
+	if (len > ghes->generic->error_block_length)
+		goto err_read_block;
+	if (apei_estatus_check_header(ghes->estatus))
+		goto err_read_block;
+	rc = ghes_copy_tofrom_phys(ghes->estatus + 1,
+				   buf_paddr + sizeof(*ghes->estatus),
+				   len - sizeof(*ghes->estatus), 1);
+	if (rc)
+		return rc;
+	if (apei_estatus_check(ghes->estatus))
+		goto err_read_block;
+	rc = 0;
+
+err_read_block:
+	if (rc && !silent)
+		pr_warning(FW_WARN GHES_PFX
+			   "Failed to read error status block!\n");
+	return rc;
+}
+
+static void ghes_clear_estatus(struct ghes *ghes)
+{
+	ghes->estatus->block_status = 0;
+	if (!(ghes->flags & GHES_TO_CLEAR))
+		return;
+	ghes_copy_tofrom_phys(ghes->estatus, ghes->buffer_paddr,
+			      sizeof(ghes->estatus->block_status), 0);
+	ghes->flags &= ~GHES_TO_CLEAR;
+}
+
+static void ghes_do_proc(struct ghes *ghes)
+{
+	int ser, processed = 0;
+	struct acpi_hest_generic_data *gdata;
+
+	ser = ghes_severity(ghes->estatus->error_severity);
+	apei_estatus_for_each_section(ghes->estatus, gdata) {
+#ifdef CONFIG_X86_MCE
+		if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
+				 CPER_SEC_PLATFORM_MEM)) {
+			apei_mce_report_mem_error(
+				ser == GHES_SER_CORRECTED,
+				(struct cper_sec_mem_err *)(gdata+1));
+			processed = 1;
+		}
+#endif
+	}
+
+	if (!processed && printk_ratelimit())
+		pr_warning(GHES_PFX
+		"Unknown error record from generic hardware error source: %d\n",
+			   ghes->generic->header.source_id);
+}
+
+static int ghes_proc(struct ghes *ghes)
+{
+	int rc;
+
+	rc = ghes_read_estatus(ghes, 0);
+	if (rc)
+		goto out;
+	ghes_do_proc(ghes);
+
+out:
+	ghes_clear_estatus(ghes);
+	return 0;
+}
+
+static int ghes_notify_sci(struct notifier_block *this,
+				  unsigned long event, void *data)
+{
+	struct ghes *ghes;
+	int ret = NOTIFY_DONE;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ghes, &ghes_sci, list) {
+		if (!ghes_proc(ghes))
+			ret = NOTIFY_OK;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static struct notifier_block ghes_notifier_sci = {
+	.notifier_call = ghes_notify_sci,
+};
+
+static int hest_ghes_parse(struct acpi_hest_header *hest_hdr, void *data)
+{
+	struct acpi_hest_generic *generic;
+	struct ghes *ghes = NULL;
+	int rc = 0;
+
+	if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR)
+		return 0;
+
+	generic = (struct acpi_hest_generic *)hest_hdr;
+	if (!generic->enabled)
+		return 0;
+
+	if (generic->error_block_length <
+	    sizeof(struct acpi_hest_generic_status)) {
+		pr_warning(FW_BUG GHES_PFX
+"Invalid error block length: %u for generic hardware error source: %d\n",
+			   generic->error_block_length,
+			   generic->header.source_id);
+		goto err;
+	}
+	if (generic->records_to_preallocate == 0) {
+		pr_warning(FW_BUG GHES_PFX
+"Invalid records to preallocate: %u for generic hardware error source: %d\n",
+			   generic->records_to_preallocate,
+			   generic->header.source_id);
+		goto err;
+	}
+	ghes = ghes_new(generic);
+	if (IS_ERR(ghes)) {
+		rc = PTR_ERR(ghes);
+		ghes = NULL;
+		goto err;
+	}
+	switch (generic->notify.type) {
+	case ACPI_HEST_NOTIFY_POLLED:
+		pr_warning(GHES_PFX
+"Generic hardware error source: %d notified via POLL is not supported!\n",
+			   generic->header.source_id);
+		break;
+	case ACPI_HEST_NOTIFY_EXTERNAL:
+	case ACPI_HEST_NOTIFY_LOCAL:
+		pr_warning(GHES_PFX
+"Generic hardware error source: %d notified via IRQ is not supported!\n",
+			   generic->header.source_id);
+		break;
+	case ACPI_HEST_NOTIFY_SCI:
+		if (list_empty(&ghes_sci))
+			register_acpi_hed_notifier(&ghes_notifier_sci);
+		list_add_rcu(&ghes->list, &ghes_sci);
+		break;
+	case ACPI_HEST_NOTIFY_NMI:
+		pr_warning(GHES_PFX
+"Generic hardware error source: %d notified via NMI is not supported!\n",
+			   generic->header.source_id);
+		break;
+	default:
+		pr_warning(FW_WARN GHES_PFX
+	"Unknown notification type: %u for generic hardware error source: %d\n",
+			   generic->notify.type, generic->header.source_id);
+		break;
+	}
+
+	return 0;
+err:
+	if (ghes)
+		ghes_fini(ghes);
+	return rc;
+}
+
+static void ghes_cleanup(void)
+{
+	struct ghes *ghes, *nghes;
+
+	if (!list_empty(&ghes_sci))
+		unregister_acpi_hed_notifier(&ghes_notifier_sci);
+
+	synchronize_rcu();
+
+	list_for_each_entry_safe(ghes, nghes, &ghes_sci, list) {
+		list_del(&ghes->list);
+		ghes_fini(ghes);
+		kfree(ghes);
+	}
+}
+
+static int __init ghes_init(void)
+{
+	int rc;
+
+	if (acpi_disabled)
+		return -ENODEV;
+
+	if (hest_disable) {
+		pr_info(GHES_PFX "HEST is not enabled!\n");
+		return -EINVAL;
+	}
+
+	rc = apei_hest_parse(hest_ghes_parse, NULL);
+	if (rc) {
+		pr_err(GHES_PFX
+		"Error during parsing HEST generic hardware error sources.\n");
+		goto err_cleanup;
+	}
+
+	if (list_empty(&ghes_sci)) {
+		pr_info(GHES_PFX
+			"No functional generic hardware error sources.\n");
+		rc = -ENODEV;
+		goto err_cleanup;
+	}
+
+	pr_info(GHES_PFX
+		"Generic Hardware Error Source support is initialized.\n");
+
+	return 0;
+err_cleanup:
+	ghes_cleanup();
+	return rc;
+}
+
+static void __exit ghes_exit(void)
+{
+	ghes_cleanup();
+}
+
+module_init(ghes_init);
+module_exit(ghes_exit);
+
+MODULE_AUTHOR("Huang Ying");
+MODULE_DESCRIPTION("APEI Generic Hardware Error Source support");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-55-g7522


From 482908b49ebfa453dd0455910c951c750567c05d Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Tue, 18 May 2010 14:35:22 +0800
Subject: ACPI, APEI, Use ERST for persistent storage of MCE

Traditionally, fatal MCE will cause Linux print error log to console
then reboot. Because MCE registers will preserve their content after
warm reboot, the hardware error can be logged to disk or network after
reboot. But system may fail to warm reboot, then you may lose the
hardware error log. ERST can help here. Through saving the hardware
error log into flash via ERST before go panic, the hardware error log
can be gotten from the flash after system boot successful again.

The fatal MCE processing procedure with ERST involved is as follow:

- Hardware detect error, MCE raised
- MCE read MCE registers, check error severity (fatal), prepare error record
- Write MCE error record into flash via ERST
- Go panic, then trigger system reboot
- System reboot, /sbin/mcelog run, it reads /dev/mcelog to check flash
  for error record of previous boot via ERST, and output and clear
  them if available
- /sbin/mcelog logs error records into disk or network

ERST only accepts CPER record format, but there is no pre-defined CPER
section can accommodate all information in struct mce, so a customized
section type is defined to hold struct mce inside a CPER record as an
error section.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-apei.c     | 86 +++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce-internal.h | 23 +++++++++
 arch/x86/kernel/cpu/mcheck/mce.c          | 79 ++++++++++++++++++++++++----
 3 files changed, 177 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 4eccd1fadb14..745b54f9be89 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -8,6 +8,9 @@
  * the error memory page can be offlined by /sbin/mcelog if the error
  * count for one page is beyond the threshold.
  *
+ * For fatal MCE, save MCE record into persistent storage via ERST, so
+ * that the MCE record can be logged after reboot via ERST.
+ *
  * Copyright 2010 Intel Corp.
  *   Author: Huang Ying <ying.huang@intel.com>
  *
@@ -50,3 +53,86 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
 	mce_notify_irq();
 }
 EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+
+#define CPER_CREATOR_MCE						\
+	UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c,	\
+		0x64, 0x90, 0xb8, 0x9d)
+#define CPER_SECTION_TYPE_MCE						\
+	UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96,	\
+		0x04, 0x4a, 0x38, 0xfc)
+
+/*
+ * CPER specification (in UEFI specification 2.3 appendix N) requires
+ * byte-packed.
+ */
+struct cper_mce_record {
+	struct cper_record_header hdr;
+	struct cper_section_descriptor sec_hdr;
+	struct mce mce;
+} __packed;
+
+int apei_write_mce(struct mce *m)
+{
+	struct cper_mce_record rcd;
+
+	memset(&rcd, 0, sizeof(rcd));
+	memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+	rcd.hdr.revision = CPER_RECORD_REV;
+	rcd.hdr.signature_end = CPER_SIG_END;
+	rcd.hdr.section_count = 1;
+	rcd.hdr.error_severity = CPER_SER_FATAL;
+	/* timestamp, platform_id, partition_id are all invalid */
+	rcd.hdr.validation_bits = 0;
+	rcd.hdr.record_length = sizeof(rcd);
+	rcd.hdr.creator_id = CPER_CREATOR_MCE;
+	rcd.hdr.notification_type = CPER_NOTIFY_MCE;
+	rcd.hdr.record_id = cper_next_record_id();
+	rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+	rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
+	rcd.sec_hdr.section_length = sizeof(rcd.mce);
+	rcd.sec_hdr.revision = CPER_SEC_REV;
+	/* fru_id and fru_text is invalid */
+	rcd.sec_hdr.validation_bits = 0;
+	rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
+	rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
+	rcd.sec_hdr.section_severity = CPER_SER_FATAL;
+
+	memcpy(&rcd.mce, m, sizeof(*m));
+
+	return erst_write(&rcd.hdr);
+}
+
+ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+	struct cper_mce_record rcd;
+	ssize_t len;
+
+	len = erst_read_next(&rcd.hdr, sizeof(rcd));
+	if (len <= 0)
+		return len;
+	/* Can not skip other records in storage via ERST unless clear them */
+	else if (len != sizeof(rcd) ||
+		 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
+		if (printk_ratelimit())
+			pr_warning(
+			"MCE-APEI: Can not skip the unknown record in ERST");
+		return -EIO;
+	}
+
+	memcpy(m, &rcd.mce, sizeof(*m));
+	*record_id = rcd.hdr.record_id;
+
+	return sizeof(*m);
+}
+
+/* Check whether there is record in ERST */
+int apei_check_mce(void)
+{
+	return erst_get_record_count();
+}
+
+int apei_clear_mce(u64 record_id)
+{
+	return erst_clear(record_id);
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 32996f9fab67..fefcc69ee8b5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,3 +28,26 @@ extern int mce_ser;
 
 extern struct mce_bank *mce_banks;
 
+#ifdef CONFIG_ACPI_APEI
+int apei_write_mce(struct mce *m);
+ssize_t apei_read_mce(struct mce *m, u64 *record_id);
+int apei_check_mce(void);
+int apei_clear_mce(u64 record_id);
+#else
+static inline int apei_write_mce(struct mce *m)
+{
+	return -EINVAL;
+}
+static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+	return 0;
+}
+static inline int apei_check_mce(void)
+{
+	return 0;
+}
+static inline int apei_clear_mce(u64 record_id)
+{
+	return -EINVAL;
+}
+#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767e..09535ca9b9d7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -264,7 +264,7 @@ static void wait_for_panic(void)
 
 static void mce_panic(char *msg, struct mce *final, char *exp)
 {
-	int i;
+	int i, apei_err = 0;
 
 	if (!fake_panic) {
 		/*
@@ -287,8 +287,11 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 		struct mce *m = &mcelog.entry[i];
 		if (!(m->status & MCI_STATUS_VAL))
 			continue;
-		if (!(m->status & MCI_STATUS_UC))
+		if (!(m->status & MCI_STATUS_UC)) {
 			print_mce(m);
+			if (!apei_err)
+				apei_err = apei_write_mce(m);
+		}
 	}
 	/* Now print uncorrected but with the final one last */
 	for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -297,11 +300,17 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 			continue;
 		if (!(m->status & MCI_STATUS_UC))
 			continue;
-		if (!final || memcmp(m, final, sizeof(struct mce)))
+		if (!final || memcmp(m, final, sizeof(struct mce))) {
 			print_mce(m);
+			if (!apei_err)
+				apei_err = apei_write_mce(m);
+		}
 	}
-	if (final)
+	if (final) {
 		print_mce(final);
+		if (!apei_err)
+			apei_err = apei_write_mce(final);
+	}
 	if (cpu_missing)
 		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
 	print_mce_tail();
@@ -1493,6 +1502,43 @@ static void collect_tscs(void *data)
 	rdtscll(cpu_tsc[smp_processor_id()]);
 }
 
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+	int rc;
+	u64 record_id;
+	struct mce m;
+
+	if (usize < sizeof(struct mce))
+		return -EINVAL;
+
+	rc = apei_read_mce(&m, &record_id);
+	/* Error or no more MCE record */
+	if (rc <= 0) {
+		mce_apei_read_done = 1;
+		return rc;
+	}
+	rc = -EFAULT;
+	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+		return rc;
+	/*
+	 * In fact, we should have cleared the record after that has
+	 * been flushed to the disk or sent to network in
+	 * /sbin/mcelog, but we have no interface to support that now,
+	 * so just clear it to avoid duplication.
+	 */
+	rc = apei_clear_mce(record_id);
+	if (rc) {
+		mce_apei_read_done = 1;
+		return rc;
+	}
+	*ubuf += sizeof(struct mce);
+
+	return 0;
+}
+
 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 			loff_t *off)
 {
@@ -1506,15 +1552,19 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 		return -ENOMEM;
 
 	mutex_lock(&mce_read_mutex);
+
+	if (!mce_apei_read_done) {
+		err = __mce_read_apei(&buf, usize);
+		if (err || buf != ubuf)
+			goto out;
+	}
+
 	next = rcu_dereference_check_mce(mcelog.next);
 
 	/* Only supports full reads right now */
-	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
-		mutex_unlock(&mce_read_mutex);
-		kfree(cpu_tsc);
-
-		return -EINVAL;
-	}
+	err = -EINVAL;
+	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+		goto out;
 
 	err = 0;
 	prev = 0;
@@ -1562,10 +1612,15 @@ timeout:
 			memset(&mcelog.entry[i], 0, sizeof(struct mce));
 		}
 	}
+
+	if (err)
+		err = -EFAULT;
+
+out:
 	mutex_unlock(&mce_read_mutex);
 	kfree(cpu_tsc);
 
-	return err ? -EFAULT : buf - ubuf;
+	return err ? err : buf - ubuf;
 }
 
 static unsigned int mce_poll(struct file *file, poll_table *wait)
@@ -1573,6 +1628,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &mce_wait, wait);
 	if (rcu_dereference_check_mce(mcelog.next))
 		return POLLIN | POLLRDNORM;
+	if (!mce_apei_read_done && apei_check_mce())
+		return POLLIN | POLLRDNORM;
 	return 0;
 }
 
-- 
cgit v1.2.3-55-g7522


From dcc7871128e99458ca86186b7bc8bf27ff0c47b5 Mon Sep 17 00:00:00 2001
From: Jason Wessel
Date: Thu, 20 May 2010 21:04:21 -0500
Subject: kgdb: core changes to support kdb

These are the minimum changes to the kgdb core in order to enable an
API to connect a new front end (kdb) to the debug core.

This patch introduces the dbg_kdb_mode variable controls where the
user level I/O is routed.  It will be routed to the gdbstub (kgdb) or
to the kdb front end which is a simple shell available over the kgdboc
connection.

You can switch back and forth between kdb or the gdb stub mode of
operation dynamically.  From gdb stub mode you can blindly type
"$3#33", or from the kdb mode you can enter "kgdb" to switch to the
gdb stub.

The logic in the debug core depends on kdb to look for the typical gdb
connection sequences and return immediately with KGDB_PASS_EVENT if a
gdb serial command sequence is detected.  That should allow a
reasonably seamless transition between kdb -> gdb without leaving the
kernel exception state.  The two gdb serial queries that kdb is
responsible for detecting are the "?" and "qSupported" packets.

CC: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Acked-by: Martin Hicks <mort@sgi.com>
---
 arch/arm/kernel/kgdb.c     |   5 +++
 arch/mips/kernel/kgdb.c    |   5 +++
 arch/powerpc/kernel/kgdb.c |   5 +++
 arch/x86/kernel/kgdb.c     |   5 +++
 include/linux/kgdb.h       |  11 ++++-
 kernel/debug/debug_core.c  | 107 ++++++++++++++++++++++++++++++++++++++-------
 kernel/debug/debug_core.h  |  24 ++++++++++
 kernel/debug/gdbstub.c     |  36 +++++++++++++++
 lib/Kconfig.kgdb           |   8 +++-
 9 files changed, 186 insertions(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
index a5b846b9895d..c868a8864117 100644
--- a/arch/arm/kernel/kgdb.c
+++ b/arch/arm/kernel/kgdb.c
@@ -98,6 +98,11 @@ sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task)
 	gdb_regs[_CPSR]		= thread_regs->ARM_cpsr;
 }
 
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
+{
+	regs->ARM_pc = pc;
+}
+
 static int compiled_break;
 
 int kgdb_arch_handle_exception(int exception_vector, int signo,
diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c
index 50c9bb880667..6ed4c83c869b 100644
--- a/arch/mips/kernel/kgdb.c
+++ b/arch/mips/kernel/kgdb.c
@@ -180,6 +180,11 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
 	*(ptr++) = regs->cp0_epc;
 }
 
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
+{
+	regs->cp0_epc = pc;
+}
+
 /*
  * Calls linux_debug_hook before the kernel dies. If KGDB is enabled,
  * then try to fall into the debugger
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 41bada0298c8..c81e3de1306e 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -309,6 +309,11 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 	       (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
 }
 
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
+{
+	regs->nip = pc;
+}
+
 /*
  * This function does PowerPC specific procesing for interfacing to gdb.
  */
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index b2258ca91003..f95a2c0b915c 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -690,6 +690,11 @@ unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs)
 	return instruction_pointer(regs);
 }
 
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
+{
+	regs->ip = ip;
+}
+
 struct kgdb_arch arch_kgdb_ops = {
 	/* Breakpoint instruction: */
 	.gdb_bpt_instr		= { 0xcc },
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 4830142ec339..5b37df00000d 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -16,10 +16,12 @@
 #include <linux/serial_8250.h>
 #include <linux/linkage.h>
 #include <linux/init.h>
-
 #include <asm/atomic.h>
+#ifdef CONFIG_HAVE_ARCH_KGDB
 #include <asm/kgdb.h>
+#endif
 
+#ifdef CONFIG_KGDB
 struct pt_regs;
 
 /**
@@ -262,6 +264,7 @@ extern struct kgdb_arch		arch_kgdb_ops;
 
 extern unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs);
 
+extern void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc);
 extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops);
 extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops);
 extern struct kgdb_io *dbg_io_ops;
@@ -279,5 +282,9 @@ extern int kgdb_nmicallback(int cpu, void *regs);
 
 extern int			kgdb_single_step;
 extern atomic_t			kgdb_active;
-
+#define in_dbg_master() \
+	(raw_smp_processor_id() == atomic_read(&kgdb_active))
+#else /* ! CONFIG_KGDB */
+#define in_dbg_master() (0)
+#endif /* ! CONFIG_KGDB */
 #endif /* _KGDB_H_ */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 7e03969330bc..6e1fa829fdeb 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -43,6 +43,7 @@
 #include <linux/sysrq.h>
 #include <linux/init.h>
 #include <linux/kgdb.h>
+#include <linux/kdb.h>
 #include <linux/pid.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
@@ -77,6 +78,11 @@ static DEFINE_SPINLOCK(kgdb_registration_lock);
 static int kgdb_con_registered;
 /* determine if kgdb console output should be used */
 static int kgdb_use_con;
+/* Next cpu to become the master debug core */
+int dbg_switch_cpu;
+
+/* Use kdb or gdbserver mode */
+static int dbg_kdb_mode = 1;
 
 static int __init opt_kgdb_con(char *str)
 {
@@ -100,6 +106,7 @@ static struct kgdb_bkpt		kgdb_break[KGDB_MAX_BREAKPOINTS] = {
  * The CPU# of the active CPU, or -1 if none:
  */
 atomic_t			kgdb_active = ATOMIC_INIT(-1);
+EXPORT_SYMBOL_GPL(kgdb_active);
 
 /*
  * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
@@ -301,7 +308,7 @@ int dbg_set_sw_break(unsigned long addr)
 	return 0;
 }
 
-static int kgdb_deactivate_sw_breakpoints(void)
+int dbg_deactivate_sw_breakpoints(void)
 {
 	unsigned long addr;
 	int error;
@@ -395,8 +402,14 @@ static int kgdb_io_ready(int print_wait)
 		return 1;
 	if (atomic_read(&kgdb_setting_breakpoint))
 		return 1;
-	if (print_wait)
+	if (print_wait) {
+#ifdef CONFIG_KGDB_KDB
+		if (!dbg_kdb_mode)
+			printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
+#else
 		printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
+#endif
+	}
 	return 1;
 }
 
@@ -410,7 +423,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
 	/* Panic on recursive debugger calls: */
 	exception_level++;
 	addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
-	kgdb_deactivate_sw_breakpoints();
+	dbg_deactivate_sw_breakpoints();
 
 	/*
 	 * If the break point removed ok at the place exception
@@ -443,11 +456,24 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
 	return 1;
 }
 
+static void dbg_cpu_switch(int cpu, int next_cpu)
+{
+	/* Mark the cpu we are switching away from as a slave when it
+	 * holds the kgdb_active token.  This must be done so that the
+	 * that all the cpus wait in for the debug core will not enter
+	 * again as the master. */
+	if (cpu == atomic_read(&kgdb_active)) {
+		kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
+		kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
+	}
+	kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
+}
+
 static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
 {
 	unsigned long flags;
 	int sstep_tries = 100;
-	int error = 0;
+	int error;
 	int i, cpu;
 	int trace_on = 0;
 acquirelock:
@@ -460,6 +486,8 @@ acquirelock:
 	cpu = ks->cpu;
 	kgdb_info[cpu].debuggerinfo = regs;
 	kgdb_info[cpu].task = current;
+	kgdb_info[cpu].ret_state = 0;
+	kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
 	/*
 	 * Make sure the above info reaches the primary CPU before
 	 * our cpu_in_kgdb[] flag setting does:
@@ -471,7 +499,11 @@ acquirelock:
 	 * master cpu and acquire the kgdb_active lock:
 	 */
 	while (1) {
-		if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
+cpu_loop:
+		if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) {
+			kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
+			goto cpu_master_loop;
+		} else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
 			if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
 				break;
 		} else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
@@ -513,7 +545,7 @@ return_normal:
 	}
 
 	if (!kgdb_io_ready(1)) {
-		error = 1;
+		kgdb_info[cpu].ret_state = 1;
 		goto kgdb_restore; /* No I/O connection, resume the system */
 	}
 
@@ -548,7 +580,7 @@ return_normal:
 	 * Wait for the other CPUs to be notified and be waiting for us:
 	 */
 	for_each_online_cpu(i) {
-		while (!atomic_read(&cpu_in_kgdb[i]))
+		while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i]))
 			cpu_relax();
 	}
 
@@ -557,7 +589,7 @@ return_normal:
 	 * in the debugger and all secondary CPUs are quiescent
 	 */
 	kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
-	kgdb_deactivate_sw_breakpoints();
+	dbg_deactivate_sw_breakpoints();
 	kgdb_single_step = 0;
 	kgdb_contthread = current;
 	exception_level = 0;
@@ -565,8 +597,26 @@ return_normal:
 	if (trace_on)
 		tracing_off();
 
-	/* Talk to debugger with gdbserial protocol */
-	error = gdb_serial_stub(ks);
+	while (1) {
+cpu_master_loop:
+		if (dbg_kdb_mode) {
+			kgdb_connected = 1;
+			error = kdb_stub(ks);
+		} else {
+			error = gdb_serial_stub(ks);
+		}
+
+		if (error == DBG_PASS_EVENT) {
+			dbg_kdb_mode = !dbg_kdb_mode;
+			kgdb_connected = 0;
+		} else if (error == DBG_SWITCH_CPU_EVENT) {
+			dbg_cpu_switch(cpu, dbg_switch_cpu);
+			goto cpu_loop;
+		} else {
+			kgdb_info[cpu].ret_state = error;
+			break;
+		}
+	}
 
 	/* Call the I/O driver's post_exception routine */
 	if (dbg_io_ops->post_exception)
@@ -578,11 +628,16 @@ return_normal:
 		for (i = NR_CPUS-1; i >= 0; i--)
 			atomic_dec(&passive_cpu_wait[i]);
 		/*
-		 * Wait till all the CPUs have quit
-		 * from the debugger.
+		 * Wait till all the CPUs have quit from the debugger,
+		 * but allow a CPU that hit an exception and is
+		 * waiting to become the master to remain in the debug
+		 * core.
 		 */
 		for_each_online_cpu(i) {
-			while (atomic_read(&cpu_in_kgdb[i]))
+			while (kgdb_do_roundup &&
+			       atomic_read(&cpu_in_kgdb[i]) &&
+			       !(kgdb_info[i].exception_state &
+				 DCPU_WANT_MASTER))
 				cpu_relax();
 		}
 	}
@@ -603,7 +658,7 @@ kgdb_restore:
 	clocksource_touch_watchdog();
 	local_irq_restore(flags);
 
-	return error;
+	return kgdb_info[cpu].ret_state;
 }
 
 /*
@@ -632,7 +687,8 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
 		return 0; /* Ouch, double exception ! */
 	kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
 	ret = kgdb_cpu_enter(ks, regs);
-	kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER;
+	kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER |
+						DCPU_IS_SLAVE);
 	return ret;
 }
 
@@ -665,7 +721,7 @@ static void kgdb_console_write(struct console *co, const char *s,
 
 	/* If we're debugging, or KGDB has not connected, don't try
 	 * and print. */
-	if (!kgdb_connected || atomic_read(&kgdb_active) != -1)
+	if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode)
 		return;
 
 	local_irq_save(flags);
@@ -687,8 +743,14 @@ static void sysrq_handle_dbg(int key, struct tty_struct *tty)
 		printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
 		return;
 	}
-	if (!kgdb_connected)
+	if (!kgdb_connected) {
+#ifdef CONFIG_KGDB_KDB
+		if (!dbg_kdb_mode)
+			printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
+#else
 		printk(KERN_CRIT "Entering KGDB\n");
+#endif
+	}
 
 	kgdb_breakpoint();
 }
@@ -817,6 +879,16 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
 }
 EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
 
+int dbg_io_get_char(void)
+{
+	int ret = dbg_io_ops->read_char();
+	if (!dbg_kdb_mode)
+		return ret;
+	if (ret == 127)
+		return 8;
+	return ret;
+}
+
 /**
  * kgdb_breakpoint - generate breakpoint exception
  *
@@ -839,6 +911,7 @@ static int __init opt_kgdb_wait(char *str)
 {
 	kgdb_break_asap = 1;
 
+	kdb_init(KDB_INIT_EARLY);
 	if (kgdb_io_module_registered)
 		kgdb_initial_breakpoint();
 
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index db554f9be51d..44cf3de8cf9e 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -38,6 +38,8 @@ struct debuggerinfo_struct {
 	void			*debuggerinfo;
 	struct task_struct	*task;
 	int			exception_state;
+	int			ret_state;
+	int			irq_depth;
 };
 
 extern struct debuggerinfo_struct kgdb_info[];
@@ -47,9 +49,31 @@ extern int dbg_remove_all_break(void);
 extern int dbg_set_sw_break(unsigned long addr);
 extern int dbg_remove_sw_break(unsigned long addr);
 extern int dbg_activate_sw_breakpoints(void);
+extern int dbg_deactivate_sw_breakpoints(void);
+
+/* polled character access to i/o module */
+extern int dbg_io_get_char(void);
+
+/* stub return value for switching between the gdbstub and kdb */
+#define DBG_PASS_EVENT -12345
+/* Switch from one cpu to another */
+#define DBG_SWITCH_CPU_EVENT -123456
+extern int dbg_switch_cpu;
 
 /* gdbstub interface functions */
 extern int gdb_serial_stub(struct kgdb_state *ks);
 extern void gdbstub_msg_write(const char *s, int len);
 
+/* gdbstub functions used for kdb <-> gdbstub transition */
+extern int gdbstub_state(struct kgdb_state *ks, char *cmd);
+
+#ifdef CONFIG_KGDB_KDB
+extern int kdb_stub(struct kgdb_state *ks);
+#else /* ! CONFIG_KGDB_KDB */
+static inline int kdb_stub(struct kgdb_state *ks)
+{
+	return DBG_PASS_EVENT;
+}
+#endif /* CONFIG_KGDB_KDB */
+
 #endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index ccdf0929f12d..188203a19657 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -887,6 +887,13 @@ int gdb_serial_stub(struct kgdb_state *ks)
 		case 'Z': /* Break point set */
 			gdb_cmd_break(ks);
 			break;
+#ifdef CONFIG_KGDB_KDB
+		case '3': /* Escape into back into kdb */
+			if (remcom_in_buffer[1] == '\0') {
+				gdb_cmd_detachkill(ks);
+				return DBG_PASS_EVENT;
+			}
+#endif
 		case 'C': /* Exception passing */
 			tmp = gdb_cmd_exception_pass(ks);
 			if (tmp > 0)
@@ -932,3 +939,32 @@ kgdb_exit:
 		error = 1;
 	return error;
 }
+
+int gdbstub_state(struct kgdb_state *ks, char *cmd)
+{
+	int error;
+
+	switch (cmd[0]) {
+	case 'e':
+		error = kgdb_arch_handle_exception(ks->ex_vector,
+						   ks->signo,
+						   ks->err_code,
+						   remcom_in_buffer,
+						   remcom_out_buffer,
+						   ks->linux_regs);
+		return error;
+	case 's':
+	case 'c':
+		strcpy(remcom_in_buffer, cmd);
+		return 0;
+	case '?':
+		gdb_cmd_status(ks);
+		break;
+	case '\0':
+		strcpy(remcom_out_buffer, "");
+		break;
+	}
+	dbg_io_ops->write_char('+');
+	put_packet(remcom_out_buffer);
+	return 0;
+}
diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb
index 9b5d1d7f2ef7..78de43a5e902 100644
--- a/lib/Kconfig.kgdb
+++ b/lib/Kconfig.kgdb
@@ -3,7 +3,7 @@ config HAVE_ARCH_KGDB
 	bool
 
 menuconfig KGDB
-	bool "KGDB: kernel debugging with remote gdb"
+	bool "KGDB: kernel debugger"
 	depends on HAVE_ARCH_KGDB
 	depends on DEBUG_KERNEL && EXPERIMENTAL
 	help
@@ -57,4 +57,10 @@ config KGDB_TESTS_BOOT_STRING
 	  information about other strings you could use beyond the
 	  default of V1F100.
 
+config KGDB_KDB
+	bool "KGDB_KDB: include kdb frontend for kgdb"
+	default n
+	help
+	  KDB frontend for kernel
+
 endif # KGDB
-- 
cgit v1.2.3-55-g7522


From 98ec1878cacb393975cba64f7392eece81716cb4 Mon Sep 17 00:00:00 2001
From: Jason Wessel
Date: Mon, 27 Apr 2009 10:58:06 -0500
Subject: kgdb: remove post_primary_code references

Remove all the references to the kgdb_post_primary_code.  This
function serves no useful purpose because you can obtain the same
information from the "struct kgdb_state *ks" from with in the
debugger, if for some reason you want the data.

Also remove the unintentional duplicate assignment for ks->ex_vector.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/kgdb.c    | 29 -----------------------------
 include/linux/kgdb.h      | 14 --------------
 kernel/debug/debug_core.c |  8 --------
 3 files changed, 51 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f95a2c0b915c..acba57169938 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -47,20 +47,8 @@
 #include <asm/debugreg.h>
 #include <asm/apicdef.h>
 #include <asm/system.h>
-
 #include <asm/apic.h>
 
-/*
- * Put the error code here just in case the user cares:
- */
-static int gdb_x86errcode;
-
-/*
- * Likewise, the vector number here (since GDB only gets the signal
- * number through the usual means, and that's not very specific):
- */
-static int gdb_x86vector = -1;
-
 /**
  *	pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
  *	@gdb_regs: A pointer to hold the registers in the order GDB wants.
@@ -399,23 +387,6 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
 	}
 }
 
-/**
- *	kgdb_post_primary_code - Save error vector/code numbers.
- *	@regs: Original pt_regs.
- *	@e_vector: Original error vector.
- *	@err_code: Original error code.
- *
- *	This is needed on architectures which support SMP and KGDB.
- *	This function is called after all the slave cpus have been put
- *	to a know spin state and the primary CPU has control over KGDB.
- */
-void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
-{
-	/* primary processor is completely in the debugger */
-	gdb_x86vector = e_vector;
-	gdb_x86errcode = err_code;
-}
-
 #ifdef CONFIG_SMP
 /**
  *	kgdb_roundup_cpus - Get other CPUs into a holding pattern
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 407edb1e0c4d..406f6f9286f3 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -35,20 +35,6 @@ struct pt_regs;
  */
 extern int kgdb_skipexception(int exception, struct pt_regs *regs);
 
-/**
- *	kgdb_post_primary_code - (optional) Save error vector/code numbers.
- *	@regs: Original pt_regs.
- *	@e_vector: Original error vector.
- *	@err_code: Original error code.
- *
- *	This is usually needed on architectures which support SMP and
- *	KGDB.  This function is called after all the secondary cpus have
- *	been put to a know spin state and the primary CPU has control over
- *	KGDB.
- */
-extern void kgdb_post_primary_code(struct pt_regs *regs, int e_vector,
-				  int err_code);
-
 /**
  *	kgdb_disable_hw_debug - (optional) Disable hardware debugging hook
  *	@regs: Current &struct pt_regs.
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1aed37b4c564..88a83a225374 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -203,12 +203,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
 	return 0;
 }
 
-void __weak
-kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
-{
-	return;
-}
-
 /**
  *	kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
  *	@regs: Current &struct pt_regs.
@@ -588,7 +582,6 @@ return_normal:
 	 * At this point the primary processor is completely
 	 * in the debugger and all secondary CPUs are quiescent
 	 */
-	kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
 	dbg_deactivate_sw_breakpoints();
 	kgdb_single_step = 0;
 	kgdb_contthread = current;
@@ -678,7 +671,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
 	ks->cpu			= raw_smp_processor_id();
 	ks->ex_vector		= evector;
 	ks->signo		= signo;
-	ks->ex_vector		= evector;
 	ks->err_code		= ecode;
 	ks->kgdb_usethreadid	= 0;
 	ks->linux_regs		= regs;
-- 
cgit v1.2.3-55-g7522


From f503b5ae53cb557ac351a668fcac1baab1cef0db Mon Sep 17 00:00:00 2001
From: Jason Wessel
Date: Thu, 20 May 2010 21:04:25 -0500
Subject: x86,kgdb: Add low level debug hook

The only way the debugger can handle a trap in inside rcu_lock,
notify_die, or atomic_notifier_call_chain without a triple fault is
to have a low level "first opportunity handler" in the int3 exception
handler.

Generally this will be something the vast majority of folks will not
need, but for those who need it, it is added as a kernel .config
option called KGDB_LOW_LEVEL_TRAP.

CC: Ingo Molnar <mingo@elte.hu>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: H. Peter Anvin <hpa@zytor.com>
CC: x86@kernel.org
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/include/asm/kgdb.h |  3 +++
 arch/x86/kernel/kgdb.c      | 22 +++++++++++++++++++++-
 arch/x86/kernel/traps.c     |  6 ++++++
 include/linux/kgdb.h        |  1 +
 kernel/debug/debug_core.c   |  2 +-
 lib/Kconfig.kgdb            |  9 +++++++++
 6 files changed, 41 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index e6c6c808489f..006da3687cdc 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -76,4 +76,7 @@ static inline void arch_kgdb_breakpoint(void)
 #define BREAK_INSTR_SIZE	1
 #define CACHE_FLUSH_IS_SAFE	1
 
+extern int kgdb_ll_trap(int cmd, const char *str,
+			struct pt_regs *regs, long err, int trap, int sig);
+
 #endif /* _ASM_X86_KGDB_H */
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index acba57169938..95b89d4cb8f1 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -538,7 +538,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
 			return NOTIFY_DONE;
 	}
 
-	if (kgdb_handle_exception(args->trapnr, args->signr, args->err, regs))
+	if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs))
 		return NOTIFY_DONE;
 
 	/* Must touch watchdog before return to normal operation */
@@ -546,6 +546,26 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
 	return NOTIFY_STOP;
 }
 
+#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
+int kgdb_ll_trap(int cmd, const char *str,
+		 struct pt_regs *regs, long err, int trap, int sig)
+{
+	struct die_args args = {
+		.regs	= regs,
+		.str	= str,
+		.err	= err,
+		.trapnr	= trap,
+		.signr	= sig,
+
+	};
+
+	if (!kgdb_io_module_registered)
+		return NOTIFY_DONE;
+
+	return __kgdb_notify(&args, cmd);
+}
+#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
+
 static int
 kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
 {
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 02cfb9b8f5b1..7eaad4c5110a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -15,6 +15,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
+#include <linux/kgdb.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/ptrace.h>
@@ -451,6 +452,11 @@ void restart_nmi(void)
 /* May run on IST stack. */
 dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
+#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
+	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
+			== NOTIFY_STOP)
+		return;
+#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
 #ifdef CONFIG_KPROBES
 	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
 			== NOTIFY_STOP)
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 406f6f9286f3..19d1b29a2694 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -60,6 +60,7 @@ struct uart_port;
 void kgdb_breakpoint(void);
 
 extern int kgdb_connected;
+extern int kgdb_io_module_registered;
 
 extern atomic_t			kgdb_setting_breakpoint;
 extern atomic_t			kgdb_cpu_doing_single_step;
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 88a83a225374..375e42f0baf0 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -66,7 +66,7 @@ int				kgdb_connected;
 EXPORT_SYMBOL_GPL(kgdb_connected);
 
 /* All the KGDB handlers are installed */
-static int			kgdb_io_module_registered;
+int			kgdb_io_module_registered;
 
 /* Guard for recursive entry */
 static int			exception_level;
diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb
index ee8ae7132f20..c56ccb4ad292 100644
--- a/lib/Kconfig.kgdb
+++ b/lib/Kconfig.kgdb
@@ -57,6 +57,15 @@ config KGDB_TESTS_BOOT_STRING
 	  information about other strings you could use beyond the
 	  default of V1F100.
 
+config KGDB_LOW_LEVEL_TRAP
+       bool "KGDB: Allow debugging with traps in notifiers"
+       depends on X86
+       default n
+       help
+         This will add an extra call back to kgdb for the breakpoint
+         exception handler on which will will allow kgdb to step
+         through a notify handler.
+
 config KGDB_KDB
 	bool "KGDB_KDB: include kdb frontend for kgdb"
 	default n
-- 
cgit v1.2.3-55-g7522


From 29c843912a0baa7fa63033fe28e1ca7e796686a5 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Thu, 20 May 2010 21:04:29 -0500
Subject: x86, kgdb: early trap init for early debug

Allow the x86 arch to have early exception processing for the purpose
of debugging via the kgdb.

Signed-off-by: Jan Kiszka <jan.kiszka@web.de>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/include/asm/processor.h |  2 ++
 arch/x86/kernel/setup.c          |  1 +
 arch/x86/kernel/traps.c          | 14 ++++++++++----
 3 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5a51379dcbe4..7e5c6a60b8ee 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -789,6 +789,8 @@ static inline void wbinvd_halt(void)
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
+extern void early_trap_init(void);
+
 /* Defined in head.S */
 extern struct desc_ptr		early_gdt_descr;
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c4851eff57b3..e8029896309a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -725,6 +725,7 @@ void __init setup_arch(char **cmdline_p)
 	/* VMI may relocate the fixmap; do this before touching ioremap area */
 	vmi_init();
 
+	early_trap_init();
 	early_cpu_init();
 	early_ioremap_init();
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7eaad4c5110a..142d70c74b02 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -808,6 +808,16 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 }
 #endif
 
+/* Set of traps needed for early debugging. */
+void __init early_trap_init(void)
+{
+	set_intr_gate_ist(1, &debug, DEBUG_STACK);
+	/* int3 can be called from all */
+	set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
+	set_intr_gate(14, &page_fault);
+	load_idt(&idt_descr);
+}
+
 void __init trap_init(void)
 {
 	int i;
@@ -821,10 +831,7 @@ void __init trap_init(void)
 #endif
 
 	set_intr_gate(0, &divide_error);
-	set_intr_gate_ist(1, &debug, DEBUG_STACK);
 	set_intr_gate_ist(2, &nmi, NMI_STACK);
-	/* int3 can be called from all */
-	set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
 	/* int4 can be called from all */
 	set_system_intr_gate(4, &overflow);
 	set_intr_gate(5, &bounds);
@@ -840,7 +847,6 @@ void __init trap_init(void)
 	set_intr_gate(11, &segment_not_present);
 	set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
 	set_intr_gate(13, &general_protection);
-	set_intr_gate(14, &page_fault);
 	set_intr_gate(15, &spurious_interrupt_bug);
 	set_intr_gate(16, &coprocessor_error);
 	set_intr_gate(17, &alignment_check);
-- 
cgit v1.2.3-55-g7522


From 0b4b3827db386ec6034a5aba1261025b039440c2 Mon Sep 17 00:00:00 2001
From: Jason Wessel
Date: Thu, 20 May 2010 21:04:29 -0500
Subject: x86, kgdb, init: Add early and late debug states

The kernel debugger can operate well before mm_init(), but the x86
hardware breakpoint code which uses the perf api requires that the
kernel allocators are initialized.

This means the kernel debug core needs to provide an optional arch
specific call back to allow the initialization functions to run after
the kernel has been further initialized.

The kdb shell already had a similar restriction with an early
initialization and late initialization.  The kdb_init() was moved into
the debug core's version of the late init which is called
dbg_late_init();

CC: kgdb-bugreport@lists.sourceforge.net
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/kgdb.c    | 17 ++++++++++-------
 include/linux/kgdb.h      | 14 ++++++++++++++
 init/main.c               |  4 ++--
 kernel/debug/debug_core.c | 16 ++++++++++++++++
 4 files changed, 42 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 95b89d4cb8f1..2b71ec41869f 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -595,15 +595,16 @@ static struct notifier_block kgdb_notifier = {
  *	specific callbacks.
  */
 int kgdb_arch_init(void)
+{
+	return register_die_notifier(&kgdb_notifier);
+}
+
+void kgdb_arch_late(void)
 {
 	int i, cpu;
-	int ret;
 	struct perf_event_attr attr;
 	struct perf_event **pevent;
 
-	ret = register_die_notifier(&kgdb_notifier);
-	if (ret != 0)
-		return ret;
 	/*
 	 * Pre-allocate the hw breakpoint structions in the non-atomic
 	 * portion of kgdb because this operation requires mutexs to
@@ -615,12 +616,15 @@ int kgdb_arch_init(void)
 	attr.bp_type = HW_BREAKPOINT_W;
 	attr.disabled = 1;
 	for (i = 0; i < 4; i++) {
+		if (breakinfo[i].pev)
+			continue;
 		breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
 		if (IS_ERR(breakinfo[i].pev)) {
-			printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n");
+			printk(KERN_ERR "kgdb: Could not allocate hw"
+			       "breakpoints\nDisabling the kernel debugger\n");
 			breakinfo[i].pev = NULL;
 			kgdb_arch_exit();
-			return -1;
+			return;
 		}
 		for_each_online_cpu(cpu) {
 			pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
@@ -631,7 +635,6 @@ int kgdb_arch_init(void)
 			}
 		}
 	}
-	return ret;
 }
 
 /**
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index 6c784ab6856a..9340f34d1bb5 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -207,6 +207,17 @@ extern int kgdb_validate_break_address(unsigned long addr);
 extern int kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr);
 extern int kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle);
 
+/**
+ *	kgdb_arch_late - Perform any architecture specific initalization.
+ *
+ *	This function will handle the late initalization of any
+ *	architecture specific callbacks.  This is an optional function for
+ *	handling things like late initialization of hw breakpoints.  The
+ *	default implementation does nothing.
+ */
+extern void kgdb_arch_late(void);
+
+
 /**
  * struct kgdb_arch - Describe architecture specific values.
  * @gdb_bpt_instr: The instruction to trigger a breakpoint.
@@ -285,7 +296,10 @@ extern int			kgdb_single_step;
 extern atomic_t			kgdb_active;
 #define in_dbg_master() \
 	(raw_smp_processor_id() == atomic_read(&kgdb_active))
+extern bool dbg_is_early;
+extern void __init dbg_late_init(void);
 #else /* ! CONFIG_KGDB */
 #define in_dbg_master() (0)
+#define dbg_late_init()
 #endif /* ! CONFIG_KGDB */
 #endif /* _KGDB_H_ */
diff --git a/init/main.c b/init/main.c
index 372771333d98..22881b5e95e3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -62,7 +62,7 @@
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/idr.h>
-#include <linux/kdb.h>
+#include <linux/kgdb.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
 #include <linux/kmemcheck.h>
@@ -676,7 +676,7 @@ asmlinkage void __init start_kernel(void)
 	buffer_init();
 	key_init();
 	security_init();
-	kdb_init(KDB_INIT_FULL);
+	dbg_late_init();
 	vfs_caches_init(totalram_pages);
 	signals_init();
 	/* rootfs populating might need page-writeback */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 64b5588c9638..5cb7cd1de10c 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -78,6 +78,8 @@ static DEFINE_SPINLOCK(kgdb_registration_lock);
 static int kgdb_con_registered;
 /* determine if kgdb console output should be used */
 static int kgdb_use_con;
+/* Flag for alternate operations for early debugging */
+bool dbg_is_early = true;
 /* Next cpu to become the master debug core */
 int dbg_switch_cpu;
 
@@ -777,11 +779,25 @@ static struct notifier_block kgdb_panic_event_nb = {
        .priority	= INT_MAX,
 };
 
+void __weak kgdb_arch_late(void)
+{
+}
+
+void __init dbg_late_init(void)
+{
+	dbg_is_early = false;
+	if (kgdb_io_module_registered)
+		kgdb_arch_late();
+	kdb_init(KDB_INIT_FULL);
+}
+
 static void kgdb_register_callbacks(void)
 {
 	if (!kgdb_io_module_registered) {
 		kgdb_io_module_registered = 1;
 		kgdb_arch_init();
+		if (!dbg_is_early)
+			kgdb_arch_late();
 		atomic_notifier_chain_register(&panic_notifier_list,
 					       &kgdb_panic_event_nb);
 #ifdef CONFIG_MAGIC_SYSRQ
-- 
cgit v1.2.3-55-g7522


From 031acd8c42edd61070f81c51edc89e83147a3d0e Mon Sep 17 00:00:00 2001
From: Jason Wessel
Date: Thu, 20 May 2010 21:04:30 -0500
Subject: x86,kgdb: Implement early hardware breakpoint debugging

It is not possible to use the hw_breakpoint.c API prior to mm_init(),
but it is possible to use hardware breakpoints with the kernel
debugger.

Prior to smp_init() it is possible to simply write to the dr registers
of the boot cpu directly.  This can be used up until the
kgdb_arch_late() is invoked, at which point the standard hw_breakpoint.c
API will get used.

CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/kgdb.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 2b71ec41869f..4f4af75b9482 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -199,6 +199,8 @@ static struct hw_breakpoint {
 	struct perf_event	**pev;
 } breakinfo[4];
 
+static unsigned long early_dr7;
+
 static void kgdb_correct_hw_break(void)
 {
 	int breakno;
@@ -210,6 +212,14 @@ static void kgdb_correct_hw_break(void)
 		int cpu = raw_smp_processor_id();
 		if (!breakinfo[breakno].enabled)
 			continue;
+		if (dbg_is_early) {
+			set_debugreg(breakinfo[breakno].addr, breakno);
+			early_dr7 |= encode_dr7(breakno,
+						breakinfo[breakno].len,
+						breakinfo[breakno].type);
+			set_debugreg(early_dr7, 7);
+			continue;
+		}
 		bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
 		info = counter_arch_bp(bp);
 		if (bp->attr.disabled != 1)
@@ -224,7 +234,8 @@ static void kgdb_correct_hw_break(void)
 		if (!val)
 			bp->attr.disabled = 0;
 	}
-	hw_breakpoint_restore();
+	if (!dbg_is_early)
+		hw_breakpoint_restore();
 }
 
 static int hw_break_reserve_slot(int breakno)
@@ -233,6 +244,9 @@ static int hw_break_reserve_slot(int breakno)
 	int cnt = 0;
 	struct perf_event **pevent;
 
+	if (dbg_is_early)
+		return 0;
+
 	for_each_online_cpu(cpu) {
 		cnt++;
 		pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
@@ -258,6 +272,9 @@ static int hw_break_release_slot(int breakno)
 	struct perf_event **pevent;
 	int cpu;
 
+	if (dbg_is_early)
+		return 0;
+
 	for_each_online_cpu(cpu) {
 		pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
 		if (dbg_release_bp_slot(*pevent))
@@ -302,7 +319,11 @@ static void kgdb_remove_all_hw_break(void)
 		bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
 		if (bp->attr.disabled == 1)
 			continue;
-		arch_uninstall_hw_breakpoint(bp);
+		if (dbg_is_early)
+			early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
+						 breakinfo[i].type);
+		else
+			arch_uninstall_hw_breakpoint(bp);
 		bp->attr.disabled = 1;
 	}
 }
@@ -379,6 +400,11 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
 	for (i = 0; i < 4; i++) {
 		if (!breakinfo[i].enabled)
 			continue;
+		if (dbg_is_early) {
+			early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
+						 breakinfo[i].type);
+			continue;
+		}
 		bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
 		if (bp->attr.disabled == 1)
 			continue;
-- 
cgit v1.2.3-55-g7522


From 0bb9fef9134cf4fdcfce02f9adc22d3d0725cc29 Mon Sep 17 00:00:00 2001
From: Jason Wessel
Date: Thu, 20 May 2010 21:04:30 -0500
Subject: x86,early dr regs,kgdb: Allow kernel debugger early dr register
 access

If the kernel debugger was configured, attached and started with
kgdbwait, the hardware breakpoint registers should get restored by the
kgdb code which is managing the dr registers.

CC: x86@kernel.org
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ingo Molnar <mingo@redhat.com>
CC: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/cpu/common.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c1c00d0b1692..cc83a002786e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1084,6 +1084,20 @@ static void clear_all_debug_regs(void)
 	}
 }
 
+#ifdef CONFIG_KGDB
+/*
+ * Restore debug regs if using kgdbwait and you have a kernel debugger
+ * connection established.
+ */
+static void dbg_restore_debug_regs(void)
+{
+	if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
+		arch_kgdb_ops.correct_hw_break();
+}
+#else /* ! CONFIG_KGDB */
+#define dbg_restore_debug_regs()
+#endif /* ! CONFIG_KGDB */
+
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
@@ -1174,18 +1188,8 @@ void __cpuinit cpu_init(void)
 	load_TR_desc();
 	load_LDT(&init_mm.context);
 
-#ifdef CONFIG_KGDB
-	/*
-	 * If the kgdb is connected no debug regs should be altered.  This
-	 * is only applicable when KGDB and a KGDB I/O module are built
-	 * into the kernel and you are using early debugging with
-	 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
-	 */
-	if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
-		arch_kgdb_ops.correct_hw_break();
-	else
-#endif
-		clear_all_debug_regs();
+	clear_all_debug_regs();
+	dbg_restore_debug_regs();
 
 	fpu_init();
 
@@ -1239,6 +1243,7 @@ void __cpuinit cpu_init(void)
 #endif
 
 	clear_all_debug_regs();
+	dbg_restore_debug_regs();
 
 	/*
 	 * Force FPU initialization:
-- 
cgit v1.2.3-55-g7522


From 61eaf539b9fb4926ed57e38f6545100c3432cf1b Mon Sep 17 00:00:00 2001
From: Jason Wessel
Date: Thu, 20 May 2010 21:04:31 -0500
Subject: earlyprintk,vga,kdb: Fix \b and \r for earlyprintk=vga with kdb

Allow kdb to work properly with with earlyprintk=vga by interpreting
the backspace and carriage return output characters.  These
interpretation of these characters is used for simple line editing
provided in the kdb shell.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ingo Molnar <mingo@redhat.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: x86@kernel.org
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/x86/kernel/early_printk.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index b9c830c12b4a..fa99bae75ace 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -41,6 +41,14 @@ static void early_vga_write(struct console *con, const char *str, unsigned n)
 				writew(0x720, VGABASE + 2*(max_xpos*j + i));
 			current_ypos = max_ypos-1;
 		}
+#ifdef CONFIG_KGDB_KDB
+		if (c == '\b') {
+			if (current_xpos > 0)
+				current_xpos--;
+		} else if (c == '\r') {
+			current_xpos = 0;
+		} else
+#endif
 		if (c == '\n') {
 			current_xpos = 0;
 			current_ypos++;
-- 
cgit v1.2.3-55-g7522


From 3f6ea84a3035cc0ef7488f8e93bc76766799e082 Mon Sep 17 00:00:00 2001
From: Ira W. Snyder
Date: Thu, 1 Apr 2010 11:43:30 -0700
Subject: PCI: read memory ranges out of Broadcom CNB20LE host bridge

Read the memory ranges behind the Broadcom CNB20LE host bridge out of the
hardware. This allows PCI hotplugging to work, since we know which memory
range to allocate PCI BAR's from.

The x86 PCI code automatically prefers the ACPI _CRS information when it is
available. In that case, this information is not used.

Signed-off-by: Ira W. Snyder <iws@ovro.caltech.edu>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/Kconfig            |   8 ++++
 arch/x86/pci/Makefile       |   2 +
 arch/x86/pci/broadcom_bus.c | 101 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 111 insertions(+)
 create mode 100644 arch/x86/pci/broadcom_bus.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9458685902bd..677b87d60a36 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1931,6 +1931,14 @@ config PCI_MMCONFIG
 	bool "Support mmconfig PCI config space access"
 	depends on X86_64 && PCI && ACPI
 
+config PCI_CNB20LE_QUIRK
+	bool "Read CNB20LE Host Bridge Windows"
+	depends on PCI
+	help
+	  Read the PCI windows out of the CNB20LE host bridge. This allows
+	  PCI hotplug to work on systems with the CNB20LE chipset which do
+	  not have ACPI.
+
 config DMAR
 	bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
 	depends on PCI_MSI && ACPI && EXPERIMENTAL
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index b110d97fb925..a0207a7fdf39 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -18,6 +18,8 @@ obj-$(CONFIG_X86_MRST)		+= mrst.o
 obj-y				+= common.o early.o
 obj-y				+= amd_bus.o bus_numa.o
 
+obj-$(CONFIG_PCI_CNB20LE_QUIRK)	+= broadcom_bus.o
+
 ifeq ($(CONFIG_PCI_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
 endif
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
new file mode 100644
index 000000000000..0846a5bbbfbd
--- /dev/null
+++ b/arch/x86/pci/broadcom_bus.c
@@ -0,0 +1,101 @@
+/*
+ * Read address ranges from a Broadcom CNB20LE Host Bridge
+ *
+ * Copyright (c) 2010 Ira W. Snyder <iws@ovro.caltech.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/delay.h>
+#include <linux/dmi.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <asm/pci_x86.h>
+
+#include "bus_numa.h"
+
+static void __devinit cnb20le_res(struct pci_dev *dev)
+{
+	struct pci_root_info *info;
+	struct resource res;
+	u16 word1, word2;
+	u8 fbus, lbus;
+	int i;
+
+	/*
+	 * The x86_pci_root_bus_res_quirks() function already refuses to use
+	 * this information if ACPI _CRS was used. Therefore, we don't bother
+	 * checking if ACPI is enabled, and just generate the information
+	 * for both the ACPI _CRS and no ACPI cases.
+	 */
+
+	info = &pci_root_info[pci_root_num];
+	pci_root_num++;
+
+	/* read the PCI bus numbers */
+	pci_read_config_byte(dev, 0x44, &fbus);
+	pci_read_config_byte(dev, 0x45, &lbus);
+	info->bus_min = fbus;
+	info->bus_max = lbus;
+
+	/*
+	 * Add the legacy IDE ports on bus 0
+	 *
+	 * These do not exist anywhere in the bridge registers, AFAICT. I do
+	 * not have the datasheet, so this is the best I can do.
+	 */
+	if (fbus == 0) {
+		update_res(info, 0x01f0, 0x01f7, IORESOURCE_IO, 0);
+		update_res(info, 0x03f6, 0x03f6, IORESOURCE_IO, 0);
+		update_res(info, 0x0170, 0x0177, IORESOURCE_IO, 0);
+		update_res(info, 0x0376, 0x0376, IORESOURCE_IO, 0);
+		update_res(info, 0xffa0, 0xffaf, IORESOURCE_IO, 0);
+	}
+
+	/* read the non-prefetchable memory window */
+	pci_read_config_word(dev, 0xc0, &word1);
+	pci_read_config_word(dev, 0xc2, &word2);
+	if (word1 != word2) {
+		res.start = (word1 << 16) | 0x0000;
+		res.end   = (word2 << 16) | 0xffff;
+		res.flags = IORESOURCE_MEM;
+		update_res(info, res.start, res.end, res.flags, 0);
+	}
+
+	/* read the prefetchable memory window */
+	pci_read_config_word(dev, 0xc4, &word1);
+	pci_read_config_word(dev, 0xc6, &word2);
+	if (word1 != word2) {
+		res.start = (word1 << 16) | 0x0000;
+		res.end   = (word2 << 16) | 0xffff;
+		res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH;
+		update_res(info, res.start, res.end, res.flags, 0);
+	}
+
+	/* read the IO port window */
+	pci_read_config_word(dev, 0xd0, &word1);
+	pci_read_config_word(dev, 0xd2, &word2);
+	if (word1 != word2) {
+		res.start = word1;
+		res.end   = word2;
+		res.flags = IORESOURCE_IO;
+		update_res(info, res.start, res.end, res.flags, 0);
+	}
+
+	/* print information about this host bridge */
+	res.start = fbus;
+	res.end   = lbus;
+	res.flags = IORESOURCE_BUS;
+	dev_info(&dev->dev, "CNB20LE PCI Host Bridge (domain %04x %pR)\n",
+			    pci_domain_nr(dev->bus), &res);
+
+	for (i = 0; i < info->res_num; i++)
+		dev_info(&dev->dev, "host bridge window %pR\n", &info->res[i]);
+}
+
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE,
+			cnb20le_res);
+
-- 
cgit v1.2.3-55-g7522


From 841bca1393d315d79077f272c2918423e36dc364 Mon Sep 17 00:00:00 2001
From: Akinobu Mita
Date: Tue, 18 May 2010 08:48:30 +0900
Subject: x86/mmiotrace: Remove redundant instruction prefix checks

Get rid of the duplicated entries in prefix_codes[]
to eliminate redundant checks by skip_prefix().

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: Pekka Paalanen <pq@iki.fi>
LKML-Reference: <1274140110-5841-1-git-send-email-akinobu.mita@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pf_in.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index df3d5c861cda..308e32570d84 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -34,7 +34,7 @@
 /* IA32 Manual 3, 2-1 */
 static unsigned char prefix_codes[] = {
 	0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
-	0x65, 0x2E, 0x3E, 0x66, 0x67
+	0x65, 0x66, 0x67
 };
 /* IA32 Manual 3, 3-432*/
 static unsigned int reg_rop[] = {
-- 
cgit v1.2.3-55-g7522


From 8c3ba8d049247dc06b6dcee1711a11b26647aa44 Mon Sep 17 00:00:00 2001
From: Kerstin Jonsson
Date: Mon, 24 May 2010 12:13:15 -0700
Subject: x86, apic: ack all pending irqs when crashed/on kexec

When the SMP kernel decides to crash_kexec() the local APICs may have
pending interrupts in their vector tables.

The setup routine for the local APIC has a deficient mechanism for
clearing these interrupts, it only handles interrupts that has already
been dispatched to the local core for servicing (the ISR register) safely,
it doesn't consider lower prioritized queued interrupts stored in the IRR
register.

If you have more than one pending interrupt within the same 32 bit word in
the LAPIC vector table registers you may find yourself entering the IO
APIC setup with pending interrupts left in the LAPIC.  This is a situation
for wich the IO APIC setup is not prepared.  Depending of what/which
interrupt vector/vectors are stuck in the APIC tables your system may show
various degrees of malfunctioning.  That was the reason why the
check_timer() failed in our system, the timer interrupts was blocked by
pending interrupts from the old kernel when routed trough the IO APIC.

Additional comment from Jiri Bohac:
==============
If this should go into stable release,
I'd add some kind of limit on the number of iterations, just to be safe from
hard to debug lock-ups:

+if (loops++  > MAX_LOOPS) {
+        printk("LAPIC pending clean-up")
+        break;
+}
 while (queued);

with MAX_LOOPS something like 1E9 this would leave plenty of time for the
pending IRQs to be cleared and would and still cause at most a second of delay
if the loop were to lock-up for whatever reason.

[trenn@suse.de:

V2: Use tsc if avail to bail out after 1 sec due to possible virtual
    apic_read calls which may take rather long (suggested by: Avi Kivity
    <avi@redhat.com>) If no tsc is available bail out quickly after
    cpu_khz, if we broke out too early and still have irqs pending (which
    should never happen?) we still get a WARN_ON...

V3: - Fixed indentation -> checkpatch clean
    - max_loops must be signed

V4: - Fix typo, mixed up tsc and ntsc in first rdtscll() call

V5: Adjust WARN_ON() condition to also catch error in cpu_has_tsc case]

Cc: <jbohac@novell.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Kerstin Jonsson <kerstin.jonsson@ericsson.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Tested-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Thomas Renninger <trenn@suse.de>
LKML-Reference: <201005241913.o4OJDGWM010865@imap1.linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/apic/apic.c | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e5a4a1e01618..c02cc692985c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -51,6 +51,7 @@
 #include <asm/smp.h>
 #include <asm/mce.h>
 #include <asm/kvm_para.h>
+#include <asm/tsc.h>
 
 unsigned int num_processors;
 
@@ -1151,8 +1152,13 @@ static void __cpuinit lapic_setup_esr(void)
  */
 void __cpuinit setup_local_APIC(void)
 {
-	unsigned int value;
-	int i, j;
+	unsigned int value, queued;
+	int i, j, acked = 0;
+	unsigned long long tsc = 0, ntsc;
+	long long max_loops = cpu_khz;
+
+	if (cpu_has_tsc)
+		rdtscll(tsc);
 
 	if (disable_apic) {
 		arch_disable_smp_support();
@@ -1204,13 +1210,32 @@ void __cpuinit setup_local_APIC(void)
 	 * the interrupt. Hence a vector might get locked. It was noticed
 	 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
 	 */
-	for (i = APIC_ISR_NR - 1; i >= 0; i--) {
-		value = apic_read(APIC_ISR + i*0x10);
-		for (j = 31; j >= 0; j--) {
-			if (value & (1<<j))
-				ack_APIC_irq();
+	do {
+		queued = 0;
+		for (i = APIC_ISR_NR - 1; i >= 0; i--)
+			queued |= apic_read(APIC_IRR + i*0x10);
+
+		for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+			value = apic_read(APIC_ISR + i*0x10);
+			for (j = 31; j >= 0; j--) {
+				if (value & (1<<j)) {
+					ack_APIC_irq();
+					acked++;
+				}
+			}
 		}
-	}
+		if (acked > 256) {
+			printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
+			       acked);
+			break;
+		}
+		if (cpu_has_tsc) {
+			rdtscll(ntsc);
+			max_loops = (cpu_khz << 10) - (ntsc - tsc);
+		} else
+			max_loops--;
+	} while (queued && max_loops > 0);
+	WARN_ON(max_loops <= 0);
 
 	/*
 	 * Now that we are all set up, enable the APIC
-- 
cgit v1.2.3-55-g7522


From b46fc5f235be04a7f77fb2af1d8cb809889c25c1 Mon Sep 17 00:00:00 2001
From: Julia Lawall
Date: Mon, 24 May 2010 12:13:16 -0700
Subject: arch/x86/pci: use kasprintf

kasprintf combines kmalloc and sprintf, and takes care of the size
calculation itself.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@
expression a,flag;
expression list args;
statement S;
@@

  a =
-  \(kmalloc\|kzalloc\)(...,flag)
+  kasprintf(flag,args)
  <... when != a
  if (a == NULL || ...) S
  ...>
- sprintf(a,args);
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
LKML-Reference: <201005241913.o4OJDG3R010871@imap1.linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/pci/acpi.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 31930fd30ea9..7c0ad634694a 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -207,10 +207,9 @@ get_current_resources(struct acpi_device *device, int busnum,
 	if (!info.res)
 		goto res_alloc_fail;
 
-	info.name = kmalloc(16, GFP_KERNEL);
+	info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
 	if (!info.name)
 		goto name_alloc_fail;
-	sprintf(info.name, "PCI Bus %04x:%02x", domain, busnum);
 
 	info.res_num = 0;
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
-- 
cgit v1.2.3-55-g7522


From 5f2eb55026c91f8400ab4469aff88b2e201b5616 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Mon, 24 May 2010 12:13:17 -0700
Subject: x86: "nosmp" command line option should force the system into UP mode

Bits set in cpu_possible_mask prior to the execution of
prefill_possible_map() (i.e.  when parsing ACPI or MPS tables) would
prevent the SMP alternatives logic from switching to UP mode, plus
unnecessary setup of per-CPU data for CPUs that can never come online.

Additionally, without CONFIG_HOTPLUG_CPU disabled CPUs can never come
online, and hence setting cpu_possible_mask bits for them is again a
simple waste of resources.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <201005241913.o4OJDH3Z010874@imap1.linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/smpboot.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 763d815e27a0..37462f1ddba5 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1215,9 +1215,17 @@ __init void prefill_possible_map(void)
 	if (!num_processors)
 		num_processors = 1;
 
-	if (setup_possible_cpus == -1)
-		possible = num_processors + disabled_cpus;
-	else
+	i = setup_max_cpus ?: 1;
+	if (setup_possible_cpus == -1) {
+		possible = num_processors;
+#ifdef CONFIG_HOTPLUG_CPU
+		if (setup_max_cpus)
+			possible += disabled_cpus;
+#else
+		if (possible > i)
+			possible = i;
+#endif
+	} else
 		possible = setup_possible_cpus;
 
 	total_cpus = max_t(int, possible, num_processors + disabled_cpus);
@@ -1230,11 +1238,23 @@ __init void prefill_possible_map(void)
 		possible = nr_cpu_ids;
 	}
 
+#ifdef CONFIG_HOTPLUG_CPU
+	if (!setup_max_cpus)
+#endif
+	if (possible > i) {
+		printk(KERN_WARNING
+			"%d Processors exceeds max_cpus limit of %u\n",
+			possible, setup_max_cpus);
+		possible = i;
+	}
+
 	printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
 		possible, max_t(int, possible - num_processors, 0));
 
 	for (i = 0; i < possible; i++)
 		set_cpu_possible(i, true);
+	for (; i < NR_CPUS; i++)
+		set_cpu_possible(i, false);
 
 	nr_cpu_ids = possible;
 }
-- 
cgit v1.2.3-55-g7522


From 3d6e77a3ddb8e4156b89f4273ff8c7d37abaf781 Mon Sep 17 00:00:00 2001
From: Gabor Gombas
Date: Mon, 24 May 2010 12:13:18 -0700
Subject: x86, setup: Phoenix BIOS fixup is needed on Dell Inspiron Mini 1012

The low-memory corruption checker triggers during suspend/resume, so we
need to reserve the low 64k.  Don't be fooled that the BIOS identifies
itself as "Dell Inc.", it's still Phoenix BIOS.

[ hpa: I think we blacklist almost every BIOS in existence.  We should
either change this to a whitelist or just make it unconditional. ]

Signed-off-by: Gabor Gombas <gombasg@digikabel.hu>
LKML-Reference: <201005241913.o4OJDIMM010877@imap1.linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: <stable@kernel.org>
---
 arch/x86/kernel/setup.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e8029896309a..b4ae4acbd031 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -676,6 +676,17 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
 		},
 	},
+	/*
+	 * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
+	 * match on the product name.
+	 */
+	{
+		.callback = dmi_low_memory_corruption,
+		.ident = "Phoenix BIOS",
+		.matches = {
+			DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
+		},
+	},
 #endif
 	{}
 };
-- 
cgit v1.2.3-55-g7522


From 48691ff86d91db1090551ec2a5ae0d80ef59105f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 24 May 2010 12:13:19 -0700
Subject: x86: remove last traces of quicklist usage

We still have a stray quicklist header included even though we axed
quicklist usage quite a while back.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <201005241913.o4OJDJe9010881@imap1.linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/pgtable_32.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 792854003ed3..cac718499256 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -9,7 +9,6 @@
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
 #include <linux/module.h>
-#include <linux/quicklist.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
-- 
cgit v1.2.3-55-g7522


From 87f44bbc246c5244c76a701f8eefba7788bce64a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 25 May 2010 11:02:55 +0200
Subject: perf, trace: Fix !x86 build bug

Patch b7e2ecef92 (perf, trace: Optimize tracepoints by removing
IRQ-disable from perf/tracepoint interaction) made the
unfortunate mistake of assuming the world is x86 only, correct
this.

The problem was that perf_fetch_caller_regs() did
local_save_flags() into regs->flags, and I re-used that to
remove another local_save_flags(), forgetting !x86 doesn't have
regs->flags.

Do the reverse, remove the local_save_flags() from
perf_fetch_caller_regs() and let the ftrace site do the
local_save_flags() instead.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Cc: acme@redhat.com
Cc: efault@gmx.de
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
LKML-Reference: <1274778175.5882.623.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 6 +++++-
 kernel/trace/trace_event_perf.c  | 4 +++-
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index fd4db0db3708..c77586061bcb 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1717,7 +1717,11 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 	 */
 	regs->bp = rewind_frame_pointer(skip + 1);
 	regs->cs = __KERNEL_CS;
-	local_save_flags(regs->flags);
+	/*
+	 * We abuse bit 3 to pass exact information, see perf_misc_flags
+	 * and the comment with PERF_EFLAGS_EXACT.
+	 */
+	regs->flags = 0;
 }
 
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 26b8607a0abc..cb6f365016e4 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -157,6 +157,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
 				       struct pt_regs *regs, int *rctxp)
 {
 	struct trace_entry *entry;
+	unsigned long flags;
 	char *raw_data;
 	int pc;
 
@@ -174,7 +175,8 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
 	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
 
 	entry = (struct trace_entry *)raw_data;
-	tracing_generic_entry_update(entry, regs->flags, pc);
+	local_save_flags(flags);
+	tracing_generic_entry_update(entry, flags, pc);
 	entry->type = type;
 
 	return raw_data;
-- 
cgit v1.2.3-55-g7522


From b3b77c8caef1750ebeea1054e39e358550ea9f55 Mon Sep 17 00:00:00 2001
From: Joakim Tjernlund
Date: Mon, 24 May 2010 14:33:01 -0700
Subject: endian: #define __BYTE_ORDER

Linux does not define __BYTE_ORDER in its endian header files which makes
some header files bend backwards to get at the current endian.  Lets
#define __BYTE_ORDER in big_endian.h/litte_endian.h to make it easier for
header files that are used in user space too.

In userspace the convention is that

  1. _both_ __LITTLE_ENDIAN and __BIG_ENDIAN are defined,
  2. you have to test for e.g. __BYTE_ORDER == __BIG_ENDIAN.

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/math-emu/sfp-util.h          | 5 -----
 arch/powerpc/include/asm/sfp-machine.h  | 6 ------
 arch/s390/include/asm/sfp-util.h        | 2 --
 arch/sh/math-emu/sfp-util.h             | 4 ----
 arch/sparc/math-emu/sfp-util_32.h       | 6 ------
 arch/sparc/math-emu/sfp-util_64.h       | 6 ------
 arch/x86/boot/compressed/relocs.c       | 4 ++--
 include/linux/byteorder/big_endian.h    | 3 +++
 include/linux/byteorder/little_endian.h | 3 +++
 9 files changed, 8 insertions(+), 31 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/math-emu/sfp-util.h b/arch/alpha/math-emu/sfp-util.h
index f53707f77455..d4c6ae7fee47 100644
--- a/arch/alpha/math-emu/sfp-util.h
+++ b/arch/alpha/math-emu/sfp-util.h
@@ -28,8 +28,3 @@ extern unsigned long __udiv_qrnnd (unsigned long *, unsigned long,
 #define UDIV_NEEDS_NORMALIZATION 1  
 
 #define abort()			goto bad_insn
-
-#ifndef __LITTLE_ENDIAN
-#define __LITTLE_ENDIAN -1
-#endif
-#define __BYTE_ORDER __LITTLE_ENDIAN
diff --git a/arch/powerpc/include/asm/sfp-machine.h b/arch/powerpc/include/asm/sfp-machine.h
index 3a7a67a0d006..8b8fab91ad1e 100644
--- a/arch/powerpc/include/asm/sfp-machine.h
+++ b/arch/powerpc/include/asm/sfp-machine.h
@@ -353,12 +353,6 @@
 #define abort()								\
 	return 0
 
-#ifdef __BIG_ENDIAN
-#define __BYTE_ORDER __BIG_ENDIAN
-#else
-#define __BYTE_ORDER __LITTLE_ENDIAN
-#endif
-
 /* Exception flags. */
 #define EFLAG_INVALID		(1 << (31 - 2))
 #define EFLAG_OVERFLOW		(1 << (31 - 3))
diff --git a/arch/s390/include/asm/sfp-util.h b/arch/s390/include/asm/sfp-util.h
index 0addc6466d95..7d43fee17e32 100644
--- a/arch/s390/include/asm/sfp-util.h
+++ b/arch/s390/include/asm/sfp-util.h
@@ -73,5 +73,3 @@ extern unsigned long __udiv_qrnnd (unsigned int *, unsigned int,
 #define UDIV_NEEDS_NORMALIZATION 0
 
 #define abort() return 0
-
-#define __BYTE_ORDER __BIG_ENDIAN
diff --git a/arch/sh/math-emu/sfp-util.h b/arch/sh/math-emu/sfp-util.h
index 8ae1bd310ad0..e8526021892f 100644
--- a/arch/sh/math-emu/sfp-util.h
+++ b/arch/sh/math-emu/sfp-util.h
@@ -66,7 +66,3 @@
   } while (0)
 
 #define abort()	return 0
-
-#define __BYTE_ORDER __LITTLE_ENDIAN
-
-
diff --git a/arch/sparc/math-emu/sfp-util_32.h b/arch/sparc/math-emu/sfp-util_32.h
index d1b2aff3c259..0ea35afbb914 100644
--- a/arch/sparc/math-emu/sfp-util_32.h
+++ b/arch/sparc/math-emu/sfp-util_32.h
@@ -107,9 +107,3 @@
 
 #define abort()								\
 	return 0
-
-#ifdef __BIG_ENDIAN
-#define __BYTE_ORDER __BIG_ENDIAN
-#else
-#define __BYTE_ORDER __LITTLE_ENDIAN
-#endif
diff --git a/arch/sparc/math-emu/sfp-util_64.h b/arch/sparc/math-emu/sfp-util_64.h
index 425d3cf01af4..d17c9bc72181 100644
--- a/arch/sparc/math-emu/sfp-util_64.h
+++ b/arch/sparc/math-emu/sfp-util_64.h
@@ -112,9 +112,3 @@
 
 #define abort() \
 	return 0
-
-#ifdef __BIG_ENDIAN
-#define __BYTE_ORDER __BIG_ENDIAN
-#else
-#define __BYTE_ORDER __LITTLE_ENDIAN
-#endif
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index 89bbf4e4d05d..7b1aaa20c7b5 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -195,11 +195,11 @@ static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
 
 
-#if BYTE_ORDER == LITTLE_ENDIAN
+#if __BYTE_ORDER == __LITTLE_ENDIAN
 #define le16_to_cpu(val) (val)
 #define le32_to_cpu(val) (val)
 #endif
-#if BYTE_ORDER == BIG_ENDIAN
+#if __BYTE_ORDER == __BIG_ENDIAN
 #define le16_to_cpu(val) bswap_16(val)
 #define le32_to_cpu(val) bswap_32(val)
 #endif
diff --git a/include/linux/byteorder/big_endian.h b/include/linux/byteorder/big_endian.h
index 3c80fd7e8b56..d53a67dff018 100644
--- a/include/linux/byteorder/big_endian.h
+++ b/include/linux/byteorder/big_endian.h
@@ -7,6 +7,9 @@
 #ifndef __BIG_ENDIAN_BITFIELD
 #define __BIG_ENDIAN_BITFIELD
 #endif
+#ifndef __BYTE_ORDER
+#define __BYTE_ORDER __BIG_ENDIAN
+#endif
 
 #include <linux/types.h>
 #include <linux/swab.h>
diff --git a/include/linux/byteorder/little_endian.h b/include/linux/byteorder/little_endian.h
index 83195fb82962..f7f8ad13adb6 100644
--- a/include/linux/byteorder/little_endian.h
+++ b/include/linux/byteorder/little_endian.h
@@ -7,6 +7,9 @@
 #ifndef __LITTLE_ENDIAN_BITFIELD
 #define __LITTLE_ENDIAN_BITFIELD
 #endif
+#ifndef __BYTE_ORDER
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#endif
 
 #include <linux/types.h>
 #include <linux/swab.h>
-- 
cgit v1.2.3-55-g7522


From a321cedb12904114e2ba5041a3673ca24deb09c9 Mon Sep 17 00:00:00 2001
From: Carsten Emde
Date: Mon, 24 May 2010 14:33:41 -0700
Subject: drivers/hwmon/coretemp.c: get TjMax value from MSR

The MSR IA32_TEMPERATURE_TARGET contains the TjMax value in the newer
Intel processors.

Signed-off-by: Huaxu Wan <huaxu.wan@linux.intel.com>
Signed-off-by: Carsten Emde <C.Emde@osadl.org>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Cc: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Cc: Yong Wang <yong.y.wang@linux.intel.com>
Cc: Rudolf Marek <r.marek@assembler.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/msr-index.h |  2 ++
 drivers/hwmon/coretemp.c         | 61 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 59 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index f9324851eba0..b49d8ca228f6 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -236,6 +236,8 @@
 
 #define MSR_IA32_MISC_ENABLE		0x000001a0
 
+#define MSR_IA32_TEMPERATURE_TARGET	0x000001a2
+
 /* MISC_ENABLE bits: architectural */
 #define MSR_IA32_MISC_ENABLE_FAST_STRING	(1ULL << 0)
 #define MSR_IA32_MISC_ENABLE_TCC		(1ULL << 1)
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index 9fae7cbc5d76..2988da150ed6 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -241,6 +241,55 @@ static int __devinit adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *
 	return tjmax;
 }
 
+static int __devinit get_tjmax(struct cpuinfo_x86 *c, u32 id,
+			       struct device *dev)
+{
+	/* The 100C is default for both mobile and non mobile CPUs */
+	int err;
+	u32 eax, edx;
+	u32 val;
+
+	/* A new feature of current Intel(R) processors, the
+	   IA32_TEMPERATURE_TARGET contains the TjMax value */
+	err = rdmsr_safe_on_cpu(id, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
+	if (err) {
+		dev_warn(dev, "Unable to read TjMax from CPU.\n");
+	} else {
+		val = (eax >> 16) & 0xff;
+		/*
+		 * If the TjMax is not plausible, an assumption
+		 * will be used
+		 */
+		if ((val > 80) && (val < 120)) {
+			dev_info(dev, "TjMax is %d C.\n", val);
+			return val * 1000;
+		}
+	}
+
+	/*
+	 * An assumption is made for early CPUs and unreadable MSR.
+	 * NOTE: the given value may not be correct.
+	 */
+
+	switch (c->x86_model) {
+	case 0xe:
+	case 0xf:
+	case 0x16:
+	case 0x1a:
+		dev_warn(dev, "TjMax is assumed as 100 C!\n");
+		return 100000;
+		break;
+	case 0x17:
+	case 0x1c:		/* Atom CPUs */
+		return adjust_tjmax(c, id, dev);
+		break;
+	default:
+		dev_warn(dev, "CPU (model=0x%x) is not supported yet,"
+			" using default TjMax of 100C.\n", c->x86_model);
+		return 100000;
+	}
+}
+
 static int __devinit coretemp_probe(struct platform_device *pdev)
 {
 	struct coretemp_data *data;
@@ -283,14 +332,18 @@ static int __devinit coretemp_probe(struct platform_device *pdev)
 		}
 	}
 
-	data->tjmax = adjust_tjmax(c, data->id, &pdev->dev);
+	data->tjmax = get_tjmax(c, data->id, &pdev->dev);
 	platform_set_drvdata(pdev, data);
 
-	/* read the still undocumented IA32_TEMPERATURE_TARGET it exists
-	   on older CPUs but not in this register, Atoms don't have it either */
+	/*
+	 * read the still undocumented IA32_TEMPERATURE_TARGET. It exists
+	 * on older CPUs but not in this register,
+	 * Atoms don't have it either.
+	 */
 
 	if ((c->x86_model > 0xe) && (c->x86_model != 0x1c)) {
-		err = rdmsr_safe_on_cpu(data->id, 0x1a2, &eax, &edx);
+		err = rdmsr_safe_on_cpu(data->id, MSR_IA32_TEMPERATURE_TARGET,
+		    &eax, &edx);
 		if (err) {
 			dev_warn(&pdev->dev, "Unable to read"
 					" IA32_TEMPERATURE_TARGET MSR\n");
-- 
cgit v1.2.3-55-g7522


From 578454ff7eab61d13a26b568f99a89a2c9edc881 Mon Sep 17 00:00:00 2001
From: Kay Sievers
Date: Thu, 20 May 2010 18:07:20 +0200
Subject: driver core: add devname module aliases to allow module on-demand
 auto-loading

This adds:
  alias: devname:<name>
to some common kernel modules, which will allow the on-demand loading
of the kernel module when the device node is accessed.

Ideally all these modules would be compiled-in, but distros seems too
much in love with their modularization that we need to cover the common
cases with this new facility. It will allow us to remove a bunch of pretty
useless init scripts and modprobes from init scripts.

The static device node aliases will be carried in the module itself. The
program depmod will extract this information to a file in the module directory:
  $ cat /lib/modules/2.6.34-00650-g537b60d-dirty/modules.devname
  # Device nodes to trigger on-demand module loading.
  microcode cpu/microcode c10:184
  fuse fuse c10:229
  ppp_generic ppp c108:0
  tun net/tun c10:200
  dm_mod mapper/control c10:235

Udev will pick up the depmod created file on startup and create all the
static device nodes which the kernel modules specify, so that these modules
get automatically loaded when the device node is accessed:
  $ /sbin/udevd --debug
  ...
  static_dev_create_from_modules: mknod '/dev/cpu/microcode' c10:184
  static_dev_create_from_modules: mknod '/dev/fuse' c10:229
  static_dev_create_from_modules: mknod '/dev/ppp' c108:0
  static_dev_create_from_modules: mknod '/dev/net/tun' c10:200
  static_dev_create_from_modules: mknod '/dev/mapper/control' c10:235
  udev_rules_apply_static_dev_perms: chmod '/dev/net/tun' 0666
  udev_rules_apply_static_dev_perms: chmod '/dev/fuse' 0666

A few device nodes are switched to statically allocated numbers, to allow
the static nodes to work. This might also useful for systems which still run
a plain static /dev, which is completely unsafe to use with any dynamic minor
numbers.

Note:
The devname aliases must be limited to the *common* and *single*instance*
device nodes, like the misc devices, and never be used for conceptually limited
systems like the loop devices, which should rather get fixed properly and get a
control node for losetup to talk to, instead of creating a random number of
device nodes in advance, regardless if they are ever used.

This facility is to hide the mess distros are creating with too modualized
kernels, and just to hide that these modules are not compiled-in, and not to
paper-over broken concepts. Thanks! :)

Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: David S. Miller <davem@davemloft.net>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Alasdair G Kergon <agk@redhat.com>
Cc: Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
Cc: Ian Kent <raven@themaw.net>
Signed-Off-By: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/devices.txt        | 2 ++
 arch/x86/kernel/microcode_core.c | 1 +
 drivers/net/ppp_generic.c        | 4 ++--
 drivers/net/tun.c                | 1 +
 fs/autofs4/dev-ioctl.c           | 5 ++++-
 fs/btrfs/super.c                 | 5 ++++-
 fs/fuse/dev.c                    | 1 +
 include/linux/miscdevice.h       | 2 ++
 8 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 53d64d382343..1d83d124056c 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -443,6 +443,8 @@ Your cooperation is appreciated.
 		231 = /dev/snapshot	System memory snapshot device
 		232 = /dev/kvm		Kernel-based virtual machine (hardware virtualization extensions)
 		233 = /dev/kmview	View-OS A process with a view
+		234 = /dev/btrfs-control	Btrfs control device
+		235 = /dev/autofs	Autofs control device
 		240-254			Reserved for local use
 		255			Reserved for MISC_DYNAMIC_MINOR
 
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 2cd8c544e41a..fa6551d36c10 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -260,6 +260,7 @@ static void microcode_dev_exit(void)
 }
 
 MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+MODULE_ALIAS("devname:cpu/microcode");
 #else
 #define microcode_dev_init()	0
 #define microcode_dev_exit()	do { } while (0)
diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
index 5441688daba7..c5f8eb102bf7 100644
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -2926,5 +2926,5 @@ EXPORT_SYMBOL(ppp_output_wakeup);
 EXPORT_SYMBOL(ppp_register_compressor);
 EXPORT_SYMBOL(ppp_unregister_compressor);
 MODULE_LICENSE("GPL");
-MODULE_ALIAS_CHARDEV_MAJOR(PPP_MAJOR);
-MODULE_ALIAS("/dev/ppp");
+MODULE_ALIAS_CHARDEV(PPP_MAJOR, 0);
+MODULE_ALIAS("devname:ppp");
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 97b25533e5fb..005cad689578 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1649,3 +1649,4 @@ MODULE_DESCRIPTION(DRV_DESCRIPTION);
 MODULE_AUTHOR(DRV_COPYRIGHT);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_MISCDEV(TUN_MINOR);
+MODULE_ALIAS("devname:net/tun");
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d29b7f6df862..d832062869f6 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -736,11 +736,14 @@ static const struct file_operations _dev_ioctl_fops = {
 };
 
 static struct miscdevice _autofs_dev_ioctl_misc = {
-	.minor 		= MISC_DYNAMIC_MINOR,
+	.minor		= AUTOFS_MINOR,
 	.name  		= AUTOFS_DEVICE_NAME,
 	.fops  		= &_dev_ioctl_fops
 };
 
+MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
+MODULE_ALIAS("devname:autofs");
+
 /* Register/deregister misc character device */
 int autofs_dev_ioctl_init(void)
 {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1866dff0538e..2909a03e5230 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -832,11 +832,14 @@ static const struct file_operations btrfs_ctl_fops = {
 };
 
 static struct miscdevice btrfs_misc = {
-	.minor		= MISC_DYNAMIC_MINOR,
+	.minor		= BTRFS_MINOR,
 	.name		= "btrfs-control",
 	.fops		= &btrfs_ctl_fops
 };
 
+MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
+MODULE_ALIAS("devname:btrfs-control");
+
 static int btrfs_interface_init(void)
 {
 	return misc_register(&btrfs_misc);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index eb7e9423691f..e53df5ebb2b8 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 
 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
+MODULE_ALIAS("devname:fuse");
 
 static struct kmem_cache *fuse_req_cachep;
 
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 8b5f7cc0fba6..b631c46cffd9 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -31,6 +31,8 @@
 #define FUSE_MINOR		229
 #define KVM_MINOR		232
 #define VHOST_NET_MINOR		233
+#define BTRFS_MINOR		234
+#define AUTOFS_MINOR		235
 #define MISC_DYNAMIC_MINOR	255
 
 struct device;
-- 
cgit v1.2.3-55-g7522


From fe501f1e89cd460793152f500bf25d81d463515b Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Tue, 25 May 2010 15:28:58 +0000
Subject: x86, k8: Fix section mismatch for powernowk8_exit()

Fix the following warning:

"WARNING: arch/x86/kernel/built-in.o(.exit.text+0x72):
Section mismatch in reference from the function powernowk8_exit() to the variable .cpuinit.data:cpb_nb

The function __exit powernowk8_exit() references a variable
__cpuinitdata cpb_nb. This is often seen when error handling in the exit
function uses functionality in the init path. The fix is often to remove
the __cpuinitdata annotation of cpb_nb so it may be used outside an init
section."

Cc: <stable@kernel.org>
Reported-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20100525152858.GA24836@aftab>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6f3dc8fbbfdc..7ec2123838e6 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1497,8 +1497,8 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
  * simply keep the boost-disable flag in sync with the current global
  * state.
  */
-static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
-				void *hcpu)
+static int cpb_notify(struct notifier_block *nb, unsigned long action,
+		      void *hcpu)
 {
 	unsigned cpu = (long)hcpu;
 	u32 lo, hi;
@@ -1528,7 +1528,7 @@ static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata cpb_nb = {
+static struct notifier_block cpb_nb = {
 	.notifier_call		= cpb_notify,
 };
 
-- 
cgit v1.2.3-55-g7522


From 13da9e200fe4740b02cd51e07ab454627e228920 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 26 May 2010 08:30:15 -0700
Subject: Revert "endian: #define __BYTE_ORDER"

This reverts commit b3b77c8caef1750ebeea1054e39e358550ea9f55, which was
also totally broken (see commit 0d2daf5cc858 that reverted the crc32
version of it).  As reported by Stephen Rothwell, it causes problems on
big-endian machines:

> In file included from fs/jfs/jfs_types.h:33,
>                  from fs/jfs/jfs_incore.h:26,
>                  from fs/jfs/file.c:22:
> fs/jfs/endian24.h:36:101: warning: "__LITTLE_ENDIAN" is not defined

The kernel has never had that crazy "__BYTE_ORDER == __LITTLE_ENDIAN"
model.  It's not how we do things, and it isn't how we _should_ do
things.  So don't go there.

Requested-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/math-emu/sfp-util.h          | 5 +++++
 arch/powerpc/include/asm/sfp-machine.h  | 6 ++++++
 arch/s390/include/asm/sfp-util.h        | 2 ++
 arch/sh/math-emu/sfp-util.h             | 4 ++++
 arch/sparc/math-emu/sfp-util_32.h       | 6 ++++++
 arch/sparc/math-emu/sfp-util_64.h       | 6 ++++++
 arch/x86/boot/compressed/relocs.c       | 4 ++--
 include/linux/byteorder/big_endian.h    | 3 ---
 include/linux/byteorder/little_endian.h | 3 ---
 9 files changed, 31 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/math-emu/sfp-util.h b/arch/alpha/math-emu/sfp-util.h
index d4c6ae7fee47..f53707f77455 100644
--- a/arch/alpha/math-emu/sfp-util.h
+++ b/arch/alpha/math-emu/sfp-util.h
@@ -28,3 +28,8 @@ extern unsigned long __udiv_qrnnd (unsigned long *, unsigned long,
 #define UDIV_NEEDS_NORMALIZATION 1  
 
 #define abort()			goto bad_insn
+
+#ifndef __LITTLE_ENDIAN
+#define __LITTLE_ENDIAN -1
+#endif
+#define __BYTE_ORDER __LITTLE_ENDIAN
diff --git a/arch/powerpc/include/asm/sfp-machine.h b/arch/powerpc/include/asm/sfp-machine.h
index 8b8fab91ad1e..3a7a67a0d006 100644
--- a/arch/powerpc/include/asm/sfp-machine.h
+++ b/arch/powerpc/include/asm/sfp-machine.h
@@ -353,6 +353,12 @@
 #define abort()								\
 	return 0
 
+#ifdef __BIG_ENDIAN
+#define __BYTE_ORDER __BIG_ENDIAN
+#else
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#endif
+
 /* Exception flags. */
 #define EFLAG_INVALID		(1 << (31 - 2))
 #define EFLAG_OVERFLOW		(1 << (31 - 3))
diff --git a/arch/s390/include/asm/sfp-util.h b/arch/s390/include/asm/sfp-util.h
index 7d43fee17e32..0addc6466d95 100644
--- a/arch/s390/include/asm/sfp-util.h
+++ b/arch/s390/include/asm/sfp-util.h
@@ -73,3 +73,5 @@ extern unsigned long __udiv_qrnnd (unsigned int *, unsigned int,
 #define UDIV_NEEDS_NORMALIZATION 0
 
 #define abort() return 0
+
+#define __BYTE_ORDER __BIG_ENDIAN
diff --git a/arch/sh/math-emu/sfp-util.h b/arch/sh/math-emu/sfp-util.h
index e8526021892f..8ae1bd310ad0 100644
--- a/arch/sh/math-emu/sfp-util.h
+++ b/arch/sh/math-emu/sfp-util.h
@@ -66,3 +66,7 @@
   } while (0)
 
 #define abort()	return 0
+
+#define __BYTE_ORDER __LITTLE_ENDIAN
+
+
diff --git a/arch/sparc/math-emu/sfp-util_32.h b/arch/sparc/math-emu/sfp-util_32.h
index 0ea35afbb914..d1b2aff3c259 100644
--- a/arch/sparc/math-emu/sfp-util_32.h
+++ b/arch/sparc/math-emu/sfp-util_32.h
@@ -107,3 +107,9 @@
 
 #define abort()								\
 	return 0
+
+#ifdef __BIG_ENDIAN
+#define __BYTE_ORDER __BIG_ENDIAN
+#else
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#endif
diff --git a/arch/sparc/math-emu/sfp-util_64.h b/arch/sparc/math-emu/sfp-util_64.h
index d17c9bc72181..425d3cf01af4 100644
--- a/arch/sparc/math-emu/sfp-util_64.h
+++ b/arch/sparc/math-emu/sfp-util_64.h
@@ -112,3 +112,9 @@
 
 #define abort() \
 	return 0
+
+#ifdef __BIG_ENDIAN
+#define __BYTE_ORDER __BIG_ENDIAN
+#else
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#endif
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index 7b1aaa20c7b5..89bbf4e4d05d 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -195,11 +195,11 @@ static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
 
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
+#if BYTE_ORDER == LITTLE_ENDIAN
 #define le16_to_cpu(val) (val)
 #define le32_to_cpu(val) (val)
 #endif
-#if __BYTE_ORDER == __BIG_ENDIAN
+#if BYTE_ORDER == BIG_ENDIAN
 #define le16_to_cpu(val) bswap_16(val)
 #define le32_to_cpu(val) bswap_32(val)
 #endif
diff --git a/include/linux/byteorder/big_endian.h b/include/linux/byteorder/big_endian.h
index d53a67dff018..3c80fd7e8b56 100644
--- a/include/linux/byteorder/big_endian.h
+++ b/include/linux/byteorder/big_endian.h
@@ -7,9 +7,6 @@
 #ifndef __BIG_ENDIAN_BITFIELD
 #define __BIG_ENDIAN_BITFIELD
 #endif
-#ifndef __BYTE_ORDER
-#define __BYTE_ORDER __BIG_ENDIAN
-#endif
 
 #include <linux/types.h>
 #include <linux/swab.h>
diff --git a/include/linux/byteorder/little_endian.h b/include/linux/byteorder/little_endian.h
index f7f8ad13adb6..83195fb82962 100644
--- a/include/linux/byteorder/little_endian.h
+++ b/include/linux/byteorder/little_endian.h
@@ -7,9 +7,6 @@
 #ifndef __LITTLE_ENDIAN_BITFIELD
 #define __LITTLE_ENDIAN_BITFIELD
 #endif
-#ifndef __BYTE_ORDER
-#define __BYTE_ORDER __LITTLE_ENDIAN
-#endif
 
 #include <linux/types.h>
 #include <linux/swab.h>
-- 
cgit v1.2.3-55-g7522


From 20413f27163fb1b8b806c0c219dc95eae67c633a Mon Sep 17 00:00:00 2001
From: Xiaotian Feng
Date: Wed, 26 May 2010 09:51:10 +0800
Subject: x86, pat: Fix memory leak in free_memtype

Reserve_memtype will allocate memory for new memtype, but
in free_memtype, after the memtype erased from rbtree, the
memory is not freed.

Changes since V1:
	make rbt_memtype_erase return erased memtype so that
	it can be freed in free_memtype.

[ hpa: not for -stable: 2.6.34 and earlier not affected ]

Signed-off-by: Xiaotian Feng <dfeng@redhat.com>
LKML-Reference: <1274838670-8731-1-git-send-email-dfeng@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Jack Steiner <steiner@sgi.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/pat.c          | 10 +++++++---
 arch/x86/mm/pat_internal.h |  6 +++---
 arch/x86/mm/pat_rbtree.c   |  7 ++++---
 3 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index bbe5502ee1cb..acc15b23b743 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -336,6 +336,7 @@ int free_memtype(u64 start, u64 end)
 {
 	int err = -EINVAL;
 	int is_range_ram;
+	struct memtype *entry;
 
 	if (!pat_enabled)
 		return 0;
@@ -355,17 +356,20 @@ int free_memtype(u64 start, u64 end)
 	}
 
 	spin_lock(&memtype_lock);
-	err = rbt_memtype_erase(start, end);
+	entry = rbt_memtype_erase(start, end);
 	spin_unlock(&memtype_lock);
 
-	if (err) {
+	if (!entry) {
 		printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
 			current->comm, current->pid, start, end);
+		return -EINVAL;
 	}
 
+	kfree(entry);
+
 	dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
 
-	return err;
+	return 0;
 }
 
 
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index 4f39eefa3e61..77e5ba153fac 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -28,15 +28,15 @@ static inline char *cattr_name(unsigned long flags)
 #ifdef CONFIG_X86_PAT
 extern int rbt_memtype_check_insert(struct memtype *new,
 					unsigned long *new_type);
-extern int rbt_memtype_erase(u64 start, u64 end);
+extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
 extern struct memtype *rbt_memtype_lookup(u64 addr);
 extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
 #else
 static inline int rbt_memtype_check_insert(struct memtype *new,
 					unsigned long *new_type)
 { return 0; }
-static inline int rbt_memtype_erase(u64 start, u64 end)
-{ return 0; }
+static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
+{ return NULL; }
 static inline struct memtype *rbt_memtype_lookup(u64 addr)
 { return NULL; }
 static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 07de4cb8cc30..f537087bb740 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -231,16 +231,17 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
 	return err;
 }
 
-int rbt_memtype_erase(u64 start, u64 end)
+struct memtype *rbt_memtype_erase(u64 start, u64 end)
 {
 	struct memtype *data;
 
 	data = memtype_rb_exact_match(&memtype_rbroot, start, end);
 	if (!data)
-		return -EINVAL;
+		goto out;
 
 	rb_erase(&data->rb, &memtype_rbroot);
-	return 0;
+out:
+	return data;
 }
 
 struct memtype *rbt_memtype_lookup(u64 addr)
-- 
cgit v1.2.3-55-g7522


From 0ac0c0d0f837c499afd02a802f9cf52d3027fa3b Mon Sep 17 00:00:00 2001
From: Jack Steiner
Date: Wed, 26 May 2010 14:42:51 -0700
Subject: cpusets: randomize node rotor used in cpuset_mem_spread_node()

Some workloads that create a large number of small files tend to assign
too many pages to node 0 (multi-node systems).  Part of the reason is that
the rotor (in cpuset_mem_spread_node()) used to assign nodes starts at
node 0 for newly created tasks.

This patch changes the rotor to be initialized to a random node number of
the cpuset.

[akpm@linux-foundation.org: fix layout]
[Lee.Schermerhorn@hp.com: Define stub numa_random() for !NUMA configuration]
Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Paul Menage <menage@google.com>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/numa.c       | 17 +++++++++++++++++
 include/linux/bitmap.h   |  1 +
 include/linux/nodemask.h |  8 ++++++++
 kernel/fork.c            |  4 ++++
 lib/bitmap.c             |  2 +-
 5 files changed, 31 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 550df481accd..10c27bb1e95f 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -2,6 +2,7 @@
 #include <linux/topology.h>
 #include <linux/module.h>
 #include <linux/bootmem.h>
+#include <linux/random.h>
 
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 # define DBG(x...) printk(KERN_DEBUG x)
@@ -65,3 +66,19 @@ const struct cpumask *cpumask_of_node(int node)
 }
 EXPORT_SYMBOL(cpumask_of_node);
 #endif
+
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ *   (returns -1 if nodemask is empty)
+ */
+int __node_random(const nodemask_t *maskp)
+{
+	int w, bit = -1;
+
+	w = nodes_weight(*maskp);
+	if (w)
+		bit = bitmap_ord_to_pos(maskp->bits,
+			get_random_int() % w, MAX_NUMNODES);
+	return bit;
+}
+EXPORT_SYMBOL(__node_random);
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index daf8c480c786..6fb2720882fc 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -141,6 +141,7 @@ extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
 extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
 extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
+extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);
 
 #define BITMAP_LAST_WORD_MASK(nbits)					\
 (									\
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index dba35e413371..8a8f1d09c133 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -66,6 +66,8 @@
  * int num_online_nodes()		Number of online Nodes
  * int num_possible_nodes()		Number of all possible Nodes
  *
+ * int node_random(mask)		Random node with set bit in mask
+ *
  * int node_online(node)		Is some node online?
  * int node_possible(node)		Is some node possible?
  *
@@ -430,6 +432,10 @@ static inline void node_set_offline(int nid)
 	node_clear_state(nid, N_ONLINE);
 	nr_online_nodes = num_node_state(N_ONLINE);
 }
+
+#define node_random(mask) __node_random(&(mask))
+extern int __node_random(const nodemask_t *maskp);
+
 #else
 
 static inline int node_state(int node, enum node_states state)
@@ -460,6 +466,8 @@ static inline int num_node_state(enum node_states state)
 
 #define node_set_online(node)	   node_set_state((node), N_ONLINE)
 #define node_set_offline(node)	   node_clear_state((node), N_ONLINE)
+
+static inline int node_random(const nodemask_t mask) { return 0; }
 #endif
 
 #define node_online_map 	node_states[N_ONLINE]
diff --git a/kernel/fork.c b/kernel/fork.c
index 4d57d9e3a6e9..2e9cc3139ec6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1079,6 +1079,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  	}
 	mpol_fix_fork_child_flag(p);
 #endif
+#ifdef CONFIG_CPUSETS
+	p->cpuset_mem_spread_rotor = node_random(p->mems_allowed);
+	p->cpuset_slab_spread_rotor = node_random(p->mems_allowed);
+#endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
diff --git a/lib/bitmap.c b/lib/bitmap.c
index ffb78c916ccd..d7137e7e06e8 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -672,7 +672,7 @@ static int bitmap_pos_to_ord(const unsigned long *buf, int pos, int bits)
  *
  * The bit positions 0 through @bits are valid positions in @buf.
  */
-static int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
+int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
 {
 	int pos = 0;
 
-- 
cgit v1.2.3-55-g7522


From a94247e7fb99170590dc9592792045c6fa49c7f5 Mon Sep 17 00:00:00 2001
From: Akinobu Mita
Date: Wed, 26 May 2010 14:43:30 -0700
Subject: x86: convert cpu notifier to return encapsulate errno value

By the previous modification, the cpu notifier can return encapsulate
errno value.  This converts the cpu notifiers for msr, cpuid, and
therm_throt.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +-
 arch/x86/kernel/cpuid.c                  | 2 +-
 arch/x86/kernel/msr.c                    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 81c499eceb21..e1a0a3bf9716 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -190,7 +190,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
 		mutex_unlock(&therm_cpu_lock);
 		break;
 	}
-	return err ? NOTIFY_BAD : NOTIFY_OK;
+	return notifier_from_errno(err);
 }
 
 static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 8b862d5900fe..1b7b31ab7d86 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -170,7 +170,7 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb,
 		cpuid_device_destroy(cpu);
 		break;
 	}
-	return err ? NOTIFY_BAD : NOTIFY_OK;
+	return notifier_from_errno(err);
 }
 
 static struct notifier_block __refdata cpuid_class_cpu_notifier =
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 4d4468e9f47c..7bf2dc4c8f70 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -230,7 +230,7 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb,
 		msr_device_destroy(cpu);
 		break;
 	}
-	return err ? NOTIFY_BAD : NOTIFY_OK;
+	return notifier_from_errno(err);
 }
 
 static struct notifier_block __refdata msr_class_cpu_notifier = {
-- 
cgit v1.2.3-55-g7522


From de006a071cbb08fff6663d98f5b9bac7ffb47559 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori
Date: Wed, 26 May 2010 14:44:16 -0700
Subject: x86: remove unnecessary sync_single_range_* in swiotlb_dma_ops

sync_single_range_for_cpu and sync_single_range_for_device hooks in
swiotlb_dma_ops are unnecessary because sync_single_for_cpu and
sync_single_for_device are used there.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/pci-swiotlb.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 7d2829dde20e..a5bc528d4328 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -31,8 +31,6 @@ static struct dma_map_ops swiotlb_dma_ops = {
 	.free_coherent = swiotlb_free_coherent,
 	.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
 	.sync_single_for_device = swiotlb_sync_single_for_device,
-	.sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
-	.sync_single_range_for_device = swiotlb_sync_single_range_for_device,
 	.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
 	.sync_sg_for_device = swiotlb_sync_sg_for_device,
 	.map_sg = swiotlb_map_sg_attrs,
-- 
cgit v1.2.3-55-g7522


From 18e98307de0d746cb0845ebf66535ce2184c25a2 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori
Date: Wed, 26 May 2010 14:44:32 -0700
Subject: asm-generic: add NEED_SG_DMA_LENGTH to define sg_dma_len()

There are only two ways to define sg_dma_len(); use sg->dma_length or
sg->length.  This patch introduces NEED_SG_DMA_LENGTH that enables
architectures to choose sg->dma_length or sg->length.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/Kconfig                    |  3 +++
 arch/sh/Kconfig                      |  3 +++
 arch/sparc/Kconfig                   |  3 +++
 arch/sparc/include/asm/scatterlist.h |  2 --
 arch/x86/Kconfig                     |  3 +++
 include/asm-generic/scatterlist.h    | 13 +++++--------
 6 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 9676100b83ee..f8afdd271f3d 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -56,6 +56,9 @@ config MMU
 config NEED_DMA_MAP_STATE
 	def_bool y
 
+config NEED_SG_DMA_LENGTH
+	def_bool y
+
 config SWIOTLB
        bool
 
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 0e318c905eea..c5ee4ce60b57 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -186,6 +186,9 @@ config DMA_NONCOHERENT
 config NEED_DMA_MAP_STATE
 	def_bool DMA_NONCOHERENT
 
+config NEED_SG_DMA_LENGTH
+	def_bool y
+
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index d6781ce687e2..6f1470baa314 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -133,6 +133,9 @@ config ZONE_DMA
 config NEED_DMA_MAP_STATE
 	def_bool y
 
+config NEED_SG_DMA_LENGTH
+	def_bool y
+
 config GENERIC_ISA_DMA
 	bool
 	default y if SPARC32
diff --git a/arch/sparc/include/asm/scatterlist.h b/arch/sparc/include/asm/scatterlist.h
index 0fa0d6da2107..69d21bb052f1 100644
--- a/arch/sparc/include/asm/scatterlist.h
+++ b/arch/sparc/include/asm/scatterlist.h
@@ -1,8 +1,6 @@
 #ifndef _SPARC_SCATTERLIST_H
 #define _SPARC_SCATTERLIST_H
 
-#define sg_dma_len(sg)     	((sg)->dma_length)
-
 #define ISA_DMA_THRESHOLD	(~0UL)
 
 #include <asm-generic/scatterlist.h>
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e0c619c55b4e..5bdc143b1228 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -109,6 +109,9 @@ config SBUS
 config NEED_DMA_MAP_STATE
        def_bool (X86_64 || DMAR || DMA_API_DEBUG)
 
+config NEED_SG_DMA_LENGTH
+	def_bool X86_64
+
 config GENERIC_ISA_DMA
 	def_bool y
 
diff --git a/include/asm-generic/scatterlist.h b/include/asm-generic/scatterlist.h
index 51a7a43ab0ce..5e087944a659 100644
--- a/include/asm-generic/scatterlist.h
+++ b/include/asm-generic/scatterlist.h
@@ -11,7 +11,9 @@ struct scatterlist {
 	unsigned int	offset;
 	unsigned int	length;
 	dma_addr_t	dma_address;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
 	unsigned int	dma_length;
+#endif
 };
 
 /*
@@ -22,17 +24,12 @@ struct scatterlist {
  * is 0.
  */
 #define sg_dma_address(sg)	((sg)->dma_address)
-#ifndef sg_dma_len
-/*
- * Normally, you have an iommu on 64 bit machines, but not on 32 bit
- * machines. Architectures that are differnt should override this.
- */
-#if __BITS_PER_LONG == 64
+
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
 #define sg_dma_len(sg)		((sg)->dma_length)
 #else
 #define sg_dma_len(sg)		((sg)->length)
-#endif /* 64 bit */
-#endif /* sg_dma_len */
+#endif
 
 #define ARCH_HAS_SG_CHAIN
 
-- 
cgit v1.2.3-55-g7522


From 4a14d84ea2adc6c02dde4ae2d4552c15e014a475 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Wed, 26 May 2010 14:44:33 -0700
Subject: x86_32: use asm-generic/scatterlist.h

Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bdc143b1228..5dc54219e73b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -110,7 +110,7 @@ config NEED_DMA_MAP_STATE
        def_bool (X86_64 || DMAR || DMA_API_DEBUG)
 
 config NEED_SG_DMA_LENGTH
-	def_bool X86_64
+	def_bool y
 
 config GENERIC_ISA_DMA
 	def_bool y
-- 
cgit v1.2.3-55-g7522


From 1ef04370d823a811d2cca9f237097559a6b99b12 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori
Date: Wed, 26 May 2010 14:44:34 -0700
Subject: asm-generic: remove ARCH_HAS_SG_CHAIN in scatterlist.h

There are more architectures that don't support ARCH_HAS_SG_CHAIN than
those that support it.  This removes removes ARCH_HAS_SG_CHAIN in
asm-generic/scatterlist.h and lets arhictectures to define it.

It's clearer than defining ARCH_HAS_SG_CHAIN asm-generic/scatterlist.h and
undefing it in arhictectures that don't support it.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/scatterlist.h      | 4 +---
 arch/arm/include/asm/scatterlist.h        | 3 ---
 arch/ia64/include/asm/scatterlist.h       | 4 ++--
 arch/microblaze/include/asm/scatterlist.h | 4 ++--
 arch/powerpc/include/asm/scatterlist.h    | 1 +
 arch/sparc/include/asm/scatterlist.h      | 5 +++--
 arch/x86/include/asm/scatterlist.h        | 5 +++--
 include/asm-generic/scatterlist.h         | 2 --
 8 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/include/asm/scatterlist.h b/arch/alpha/include/asm/scatterlist.h
index 85a0ef25516b..5728c52a7412 100644
--- a/arch/alpha/include/asm/scatterlist.h
+++ b/arch/alpha/include/asm/scatterlist.h
@@ -1,10 +1,8 @@
 #ifndef _ALPHA_SCATTERLIST_H
 #define _ALPHA_SCATTERLIST_H
 
-#define ISA_DMA_THRESHOLD (~0UL)
-
 #include <asm-generic/scatterlist.h>
 
-#undef ARCH_HAS_SG_CHAIN
+#define ISA_DMA_THRESHOLD (~0UL)
 
 #endif /* !(_ALPHA_SCATTERLIST_H) */
diff --git a/arch/arm/include/asm/scatterlist.h b/arch/arm/include/asm/scatterlist.h
index bcda59f39941..2f87870d9347 100644
--- a/arch/arm/include/asm/scatterlist.h
+++ b/arch/arm/include/asm/scatterlist.h
@@ -3,9 +3,6 @@
 
 #include <asm/memory.h>
 #include <asm/types.h>
-
 #include <asm-generic/scatterlist.h>
 
-#undef ARCH_HAS_SG_CHAIN
-
 #endif /* _ASMARM_SCATTERLIST_H */
diff --git a/arch/ia64/include/asm/scatterlist.h b/arch/ia64/include/asm/scatterlist.h
index d8e98961dec7..f299a4fb25c8 100644
--- a/arch/ia64/include/asm/scatterlist.h
+++ b/arch/ia64/include/asm/scatterlist.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_IA64_SCATTERLIST_H
 #define _ASM_IA64_SCATTERLIST_H
 
+#include <asm-generic/scatterlist.h>
 /*
  * It used to be that ISA_DMA_THRESHOLD had something to do with the
  * DMA-limits of ISA-devices.  Nowadays, its only remaining use (apart
@@ -10,7 +11,6 @@
  * that's 4GB - 1.
  */
 #define ISA_DMA_THRESHOLD	0xffffffff
-
-#include <asm-generic/scatterlist.h>
+#define ARCH_HAS_SG_CHAIN
 
 #endif /* _ASM_IA64_SCATTERLIST_H */
diff --git a/arch/microblaze/include/asm/scatterlist.h b/arch/microblaze/include/asm/scatterlist.h
index be44d94cba54..dc4a8900cc80 100644
--- a/arch/microblaze/include/asm/scatterlist.h
+++ b/arch/microblaze/include/asm/scatterlist.h
@@ -1,3 +1,3 @@
-#define ISA_DMA_THRESHOLD	(~0UL)
-
 #include <asm-generic/scatterlist.h>
+
+#define ISA_DMA_THRESHOLD	(~0UL)
diff --git a/arch/powerpc/include/asm/scatterlist.h b/arch/powerpc/include/asm/scatterlist.h
index 4ae35da4975e..34cc78fd0ef4 100644
--- a/arch/powerpc/include/asm/scatterlist.h
+++ b/arch/powerpc/include/asm/scatterlist.h
@@ -15,5 +15,6 @@
 #ifdef __powerpc64__
 #define ISA_DMA_THRESHOLD	(~0UL)
 #endif
+#define ARCH_HAS_SG_CHAIN
 
 #endif /* _ASM_POWERPC_SCATTERLIST_H */
diff --git a/arch/sparc/include/asm/scatterlist.h b/arch/sparc/include/asm/scatterlist.h
index 69d21bb052f1..433e45f05fd4 100644
--- a/arch/sparc/include/asm/scatterlist.h
+++ b/arch/sparc/include/asm/scatterlist.h
@@ -1,8 +1,9 @@
 #ifndef _SPARC_SCATTERLIST_H
 #define _SPARC_SCATTERLIST_H
 
-#define ISA_DMA_THRESHOLD	(~0UL)
-
 #include <asm-generic/scatterlist.h>
 
+#define ISA_DMA_THRESHOLD	(~0UL)
+#define ARCH_HAS_SG_CHAIN
+
 #endif /* !(_SPARC_SCATTERLIST_H) */
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
index 75af592677ec..fb0b1874396f 100644
--- a/arch/x86/include/asm/scatterlist.h
+++ b/arch/x86/include/asm/scatterlist.h
@@ -1,8 +1,9 @@
 #ifndef _ASM_X86_SCATTERLIST_H
 #define _ASM_X86_SCATTERLIST_H
 
-#define ISA_DMA_THRESHOLD (0x00ffffff)
-
 #include <asm-generic/scatterlist.h>
 
+#define ISA_DMA_THRESHOLD (0x00ffffff)
+#define ARCH_HAS_SG_CHAIN
+
 #endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/include/asm-generic/scatterlist.h b/include/asm-generic/scatterlist.h
index 5e087944a659..5de07355fad4 100644
--- a/include/asm-generic/scatterlist.h
+++ b/include/asm-generic/scatterlist.h
@@ -31,6 +31,4 @@ struct scatterlist {
 #define sg_dma_len(sg)		((sg)->length)
 #endif
 
-#define ARCH_HAS_SG_CHAIN
-
 #endif /* __ASM_GENERIC_SCATTERLIST_H */
-- 
cgit v1.2.3-55-g7522


From 7281201922a0063fa60804ce39c277fc98142a47 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn
Date: Wed, 26 May 2010 14:44:56 -0700
Subject: numa: add generic percpu var numa_node_id() implementation

Rework the generic version of the numa_node_id() function to use the new
generic percpu variable infrastructure.

Guard the new implementation with a new config option:

        CONFIG_USE_PERCPU_NUMA_NODE_ID.

Archs which support this new implemention will default this option to 'y'
when NUMA is configured.  This config option could be removed if/when all
archs switch over to the generic percpu implementation of numa_node_id().
Arch support involves:

  1) converting any existing per cpu variable implementations to use
     this implementation.  x86_64 is an instance of such an arch.
  2) archs that don't use a per cpu variable for numa_node_id() will
     need to initialize the new per cpu variable "numa_node" as cpus
     are brought on-line.  ia64 is an example.
  3) Defining USE_PERCPU_NUMA_NODE_ID in arch dependent Kconfig--e.g.,
     when NUMA is configured.  This is required because I have
     retained the old implementation by default to allow archs to
     be modified incrementally, as desired.

Subsequent patches will convert x86_64 and ia64 to use this implemenation.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Eric Whitney <eric.whitney@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/topology.h |  4 ++++
 include/linux/topology.h        | 51 +++++++++++++++++++++++++++++++++++++----
 mm/page_alloc.c                 |  5 ++++
 3 files changed, 55 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index c5087d796587..a2e629476b0e 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -170,6 +170,10 @@ static inline int numa_node_id(void)
 {
 	return 0;
 }
+/*
+ * indicate override:
+ */
+#define numa_node_id numa_node_id
 
 static inline int early_cpu_to_node(int cpu)
 {
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 5b81156780b1..2e5518f46571 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -31,6 +31,7 @@
 #include <linux/bitops.h>
 #include <linux/mmzone.h>
 #include <linux/smp.h>
+#include <linux/percpu.h>
 #include <asm/topology.h>
 
 #ifndef node_has_online_mem
@@ -203,8 +204,53 @@ int arch_update_cpu_topology(void);
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
 #endif
+
 #endif /* CONFIG_NUMA */
 
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DECLARE_PER_CPU(int, numa_node);
+
+#ifndef numa_node_id
+/* Returns the number of the current Node. */
+static inline int numa_node_id(void)
+{
+	return __this_cpu_read(numa_node);
+}
+#endif
+
+#ifndef cpu_to_node
+static inline int cpu_to_node(int cpu)
+{
+	return per_cpu(numa_node, cpu);
+}
+#endif
+
+#ifndef set_numa_node
+static inline void set_numa_node(int node)
+{
+	percpu_write(numa_node, node);
+}
+#endif
+
+#ifndef set_cpu_numa_node
+static inline void set_cpu_numa_node(int cpu, int node)
+{
+	per_cpu(numa_node, cpu) = node;
+}
+#endif
+
+#else	/* !CONFIG_USE_PERCPU_NUMA_NODE_ID */
+
+/* Returns the number of the current Node. */
+#ifndef numa_node_id
+static inline int numa_node_id(void)
+{
+	return cpu_to_node(raw_smp_processor_id());
+}
+#endif
+
+#endif	/* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */
+
 #ifndef topology_physical_package_id
 #define topology_physical_package_id(cpu)	((void)(cpu), -1)
 #endif
@@ -218,9 +264,4 @@ int arch_update_cpu_topology(void);
 #define topology_core_cpumask(cpu)		cpumask_of(cpu)
 #endif
 
-/* Returns the number of the current Node. */
-#ifndef numa_node_id
-#define numa_node_id()		(cpu_to_node(raw_smp_processor_id()))
-#endif
-
 #endif /* _LINUX_TOPOLOGY_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 08b349931ebc..6fe1b65ee1a8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,11 @@
 #include <asm/div64.h>
 #include "internal.h"
 
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DEFINE_PER_CPU(int, numa_node);
+EXPORT_PER_CPU_SYMBOL(numa_node);
+#endif
+
 /*
  * Array of node states.
  */
-- 
cgit v1.2.3-55-g7522


From e534c7c5f8d6e9fc46f57fab067c7e48d8ceb172 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn
Date: Wed, 26 May 2010 14:44:58 -0700
Subject: numa: x86_64: use generic percpu var numa_node_id() implementation

x86 arch specific changes to use generic numa_node_id() based on generic
percpu variable infrastructure.  Back out x86's custom version of
numa_node_id()

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Eric Whitney <eric.whitney@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig                |  4 ++++
 arch/x86/include/asm/topology.h | 22 +++++++++-------------
 arch/x86/kernel/cpu/common.c    |  6 +++---
 arch/x86/kernel/setup_percpu.c  |  4 ++--
 arch/x86/mm/numa_64.c           |  9 +++------
 5 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5dc54219e73b..dcb0593b4a66 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1706,6 +1706,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
 	def_bool X86_64
 	depends on NUMA
 
+config USE_PERCPU_NUMA_NODE_ID
+	def_bool X86_64
+	depends on NUMA
+
 menu "Power management and ACPI options"
 
 config ARCH_HIBERNATION_HEADER
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index a2e629476b0e..21899cc31e52 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -53,33 +53,29 @@
 extern int cpu_to_node_map[];
 
 /* Returns the number of the node containing CPU 'cpu' */
-static inline int cpu_to_node(int cpu)
+static inline int __cpu_to_node(int cpu)
 {
 	return cpu_to_node_map[cpu];
 }
-#define early_cpu_to_node(cpu)	cpu_to_node(cpu)
+#define early_cpu_to_node __cpu_to_node
+#define cpu_to_node __cpu_to_node
 
 #else /* CONFIG_X86_64 */
 
 /* Mappings between logical cpu number and node number */
 DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
 
-/* Returns the number of the current Node. */
-DECLARE_PER_CPU(int, node_number);
-#define numa_node_id()		percpu_read(node_number)
-
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
-extern int cpu_to_node(int cpu);
+/*
+ * override generic percpu implementation of cpu_to_node
+ */
+extern int __cpu_to_node(int cpu);
+#define cpu_to_node __cpu_to_node
+
 extern int early_cpu_to_node(int cpu);
 
 #else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
 
-/* Returns the number of the node containing CPU 'cpu' */
-static inline int cpu_to_node(int cpu)
-{
-	return per_cpu(x86_cpu_to_node_map, cpu);
-}
-
 /* Same function but used if called before per_cpu areas are setup */
 static inline int early_cpu_to_node(int cpu)
 {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc83a002786e..68e4a6f2211e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1121,9 +1121,9 @@ void __cpuinit cpu_init(void)
 	oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
-	if (cpu != 0 && percpu_read(node_number) == 0 &&
-	    cpu_to_node(cpu) != NUMA_NO_NODE)
-		percpu_write(node_number, cpu_to_node(cpu));
+	if (cpu != 0 && percpu_read(numa_node) == 0 &&
+	    early_cpu_to_node(cpu) != NUMA_NO_NODE)
+		set_numa_node(early_cpu_to_node(cpu));
 #endif
 
 	me = current;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ef6370b00e70..a867940a6dfc 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -265,10 +265,10 @@ void __init setup_per_cpu_areas(void)
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
 	/*
-	 * make sure boot cpu node_number is right, when boot cpu is on the
+	 * make sure boot cpu numa_node is right, when boot cpu is on the
 	 * node that doesn't have mem installed
 	 */
-	per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
+	set_cpu_numa_node(boot_cpu_id, early_cpu_to_node(boot_cpu_id));
 #endif
 
 	/* Setup node to cpumask map */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 8948f47fde05..a7bcc23ef96c 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -33,9 +33,6 @@ int numa_off __initdata;
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
-DEFINE_PER_CPU(int, node_number) = 0;
-EXPORT_PER_CPU_SYMBOL(node_number);
-
 /*
  * Map cpu index to node index
  */
@@ -809,7 +806,7 @@ void __cpuinit numa_set_node(int cpu, int node)
 	per_cpu(x86_cpu_to_node_map, cpu) = node;
 
 	if (node != NUMA_NO_NODE)
-		per_cpu(node_number, cpu) = node;
+		set_cpu_numa_node(cpu, node);
 }
 
 void __cpuinit numa_clear_node(int cpu)
@@ -867,7 +864,7 @@ void __cpuinit numa_remove_cpu(int cpu)
 	numa_set_cpumask(cpu, 0);
 }
 
-int cpu_to_node(int cpu)
+int __cpu_to_node(int cpu)
 {
 	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
 		printk(KERN_WARNING
@@ -877,7 +874,7 @@ int cpu_to_node(int cpu)
 	}
 	return per_cpu(x86_cpu_to_node_map, cpu);
 }
-EXPORT_SYMBOL(cpu_to_node);
+EXPORT_SYMBOL(__cpu_to_node);
 
 /*
  * Same function as cpu_to_node() but used if called before the
-- 
cgit v1.2.3-55-g7522


From 1ba4f22c426ba04b00fd717318d50620c621a0e1 Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Thu, 27 May 2010 12:02:00 -0700
Subject: x86, cpufeature: Unbreak compile with gcc 3.x

gcc 3 is too braindamaged to be able to compile static_cpu_has() --
apparently it can't tell that a constant passed to an inline function
is still a constant -- so if we're using gcc 3, just use the dynamic
test.  This is bad for performance, but if you care about performance,
don't use an ancient, known-to-optimize-poorly compiler.

Reported-and-tested-by: Eric Dumazet <eric.dumazet@gmail.com>
LKML-Reference: <4BF2FF82.7090005@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/cpufeature.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index dca9c545f44e..468145914389 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -332,6 +332,7 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
 #endif
 }
 
+#if __GNUC__ >= 4
 #define static_cpu_has(bit)					\
 (								\
 	__builtin_constant_p(boot_cpu_has(bit)) ?		\
@@ -340,6 +341,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
 		__static_cpu_has(bit) :				\
 		boot_cpu_has(bit)				\
 )
+#else
+/*
+ * gcc 3.x is too stupid to do the static test; fall back to dynamic.
+ */
+#define static_cpu_has(bit) boot_cpu_has(bit)
+#endif
 
 #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
 
-- 
cgit v1.2.3-55-g7522


From f52e62dcc1f555a0f8d8ccc3112454c3ff571395 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Sun, 21 Mar 2010 01:06:17 +0100
Subject: x86: remove rdc321x_defs.h

This file is replaced by a cleaner version with the adding of a MFD driver for
the southbridge.

Signed-off-by: Florian Fainelli <florian@openwrt.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 arch/x86/include/asm/rdc321x_defs.h | 12 ------------
 1 file changed, 12 deletions(-)
 delete mode 100644 arch/x86/include/asm/rdc321x_defs.h

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/rdc321x_defs.h b/arch/x86/include/asm/rdc321x_defs.h
deleted file mode 100644
index c8e9c8bed3d0..000000000000
--- a/arch/x86/include/asm/rdc321x_defs.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#define PFX	"rdc321x: "
-
-/* General purpose configuration and data registers */
-#define RDC3210_CFGREG_ADDR     0x0CF8
-#define RDC3210_CFGREG_DATA     0x0CFC
-
-#define RDC321X_GPIO_CTRL_REG1	0x48
-#define RDC321X_GPIO_CTRL_REG2	0x84
-#define RDC321X_GPIO_DATA_REG1	0x4c
-#define RDC321X_GPIO_DATA_REG2	0x88
-
-#define RDC321X_MAX_GPIO	58
-- 
cgit v1.2.3-55-g7522


From e45b7fa23097332508730123ac6d59227e7bd7f8 Mon Sep 17 00:00:00 2001
From: Len Brown
Date: Mon, 24 May 2010 11:34:36 -0400
Subject: sched: clarify commment for TS_POLLING

TS_POLLING set tells the scheduler an idle_task will poll
need_resched() to look for work.

TS_POLLING clear tells resched_task() and wake_up_idle_cpu()
that the remote CPU's idle_task is now sleeping in idle,
and thus requires a reschedule interrupt notice work.

Update the description of TS_POLLING to reflect how it works.
"idle task polling need_resched, skip sending interrupt"

Wordsmithing-by: Milton Miller <miltonm@bga.com>
Signed-off-by: Len Brown <len.brown@intel.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
---
 arch/x86/include/asm/thread_info.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e0d28901e969..812186919bba 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -241,8 +241,8 @@ static inline struct thread_info *current_thread_info(void)
 #define TS_USEDFPU		0x0001	/* FPU was used by this task
 					   this quantum (SMP) */
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
-#define TS_POLLING		0x0004	/* true if in idle loop
-					   and not sleeping */
+#define TS_POLLING		0x0004	/* idle task polling need_resched,
+					   skip sending interrupt */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
 #define TS_XSAVE		0x0010	/* Use xsave/xrstor */
 
-- 
cgit v1.2.3-55-g7522


From 35926ff5fba8245bd1c6ac04155048f6f89232b1 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 30 May 2010 09:00:03 -0700
Subject: Revert "cpusets: randomize node rotor used in
 cpuset_mem_spread_node()"

This reverts commit 0ac0c0d0f837c499afd02a802f9cf52d3027fa3b, which
caused cross-architecture build problems for all the wrong reasons.
IA64 already added its own version of __node_random(), but the fact is,
there is nothing architectural about the function, and the original
commit was just badly done. Revert it, since no fix is forthcoming.

Requested-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/numa.c       | 17 -----------------
 include/linux/bitmap.h   |  1 -
 include/linux/nodemask.h |  8 --------
 kernel/fork.c            |  4 ----
 lib/bitmap.c             |  2 +-
 5 files changed, 1 insertion(+), 31 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 10c27bb1e95f..550df481accd 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -2,7 +2,6 @@
 #include <linux/topology.h>
 #include <linux/module.h>
 #include <linux/bootmem.h>
-#include <linux/random.h>
 
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 # define DBG(x...) printk(KERN_DEBUG x)
@@ -66,19 +65,3 @@ const struct cpumask *cpumask_of_node(int node)
 }
 EXPORT_SYMBOL(cpumask_of_node);
 #endif
-
-/*
- * Return the bit number of a random bit set in the nodemask.
- *   (returns -1 if nodemask is empty)
- */
-int __node_random(const nodemask_t *maskp)
-{
-	int w, bit = -1;
-
-	w = nodes_weight(*maskp);
-	if (w)
-		bit = bitmap_ord_to_pos(maskp->bits,
-			get_random_int() % w, MAX_NUMNODES);
-	return bit;
-}
-EXPORT_SYMBOL(__node_random);
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 6fb2720882fc..daf8c480c786 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -141,7 +141,6 @@ extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
 extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
 extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
-extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);
 
 #define BITMAP_LAST_WORD_MASK(nbits)					\
 (									\
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 8a8f1d09c133..dba35e413371 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -66,8 +66,6 @@
  * int num_online_nodes()		Number of online Nodes
  * int num_possible_nodes()		Number of all possible Nodes
  *
- * int node_random(mask)		Random node with set bit in mask
- *
  * int node_online(node)		Is some node online?
  * int node_possible(node)		Is some node possible?
  *
@@ -432,10 +430,6 @@ static inline void node_set_offline(int nid)
 	node_clear_state(nid, N_ONLINE);
 	nr_online_nodes = num_node_state(N_ONLINE);
 }
-
-#define node_random(mask) __node_random(&(mask))
-extern int __node_random(const nodemask_t *maskp);
-
 #else
 
 static inline int node_state(int node, enum node_states state)
@@ -466,8 +460,6 @@ static inline int num_node_state(enum node_states state)
 
 #define node_set_online(node)	   node_set_state((node), N_ONLINE)
 #define node_set_offline(node)	   node_clear_state((node), N_ONLINE)
-
-static inline int node_random(const nodemask_t mask) { return 0; }
 #endif
 
 #define node_online_map 	node_states[N_ONLINE]
diff --git a/kernel/fork.c b/kernel/fork.c
index bf9fef6d1bfe..b6cce14ba047 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1086,10 +1086,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  	}
 	mpol_fix_fork_child_flag(p);
 #endif
-#ifdef CONFIG_CPUSETS
-	p->cpuset_mem_spread_rotor = node_random(p->mems_allowed);
-	p->cpuset_slab_spread_rotor = node_random(p->mems_allowed);
-#endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
diff --git a/lib/bitmap.c b/lib/bitmap.c
index d7137e7e06e8..ffb78c916ccd 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -672,7 +672,7 @@ static int bitmap_pos_to_ord(const unsigned long *buf, int pos, int bits)
  *
  * The bit positions 0 through @bits are valid positions in @buf.
  */
-int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
+static int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
 {
 	int pos = 0;
 
-- 
cgit v1.2.3-55-g7522