From 81559f9ad3d88c033e4ec3b6468012dbfda3b31d Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Tue, 18 Oct 2011 13:33:02 +0300
Subject: crypto: twofish-x86_64-3way - add lrw support

Patch adds LRW support for twofish-x86_64-3way by using lrw_crypt(). Patch has
been tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (twofish-3way/twofish-asm speed ratios):

Intel Celeron T1600 (fam:6, model:15, step:13):

size	lrw-enc	lrw-dec
16B	0.99x	1.00x
64B	1.17x	1.17x
256B	1.26x	1.27x
1024B	1.30x	1.31x
8192B	1.31x	1.32x

AMD Phenom II 1055T (fam:16, model:10):

size	lrw-enc	lrw-dec
16B	1.06x	1.01x
64B	1.08x	1.14x
256B	1.19x	1.20x
1024B	1.21x	1.22x
8192B	1.23x	1.24x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish_glue_3way.c | 135 ++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 5ede9c444c3e..fa9151df3637 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -32,6 +32,11 @@
 #include <crypto/algapi.h>
 #include <crypto/twofish.h>
 #include <crypto/b128ops.h>
+#include <crypto/lrw.h>
+
+#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
+#define HAS_LRW
+#endif
 
 /* regular block cipher functions from twofish_x86_64 module */
 asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
@@ -432,6 +437,124 @@ static struct crypto_alg blk_ctr_alg = {
 	},
 };
 
+#ifdef HAS_LRW
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = TF_BLOCK_SIZE;
+	struct twofish_ctx *ctx = priv;
+	int i;
+
+	if (nbytes == 3 * bsize) {
+		twofish_enc_blk_3way(ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		twofish_enc_blk(ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = TF_BLOCK_SIZE;
+	struct twofish_ctx *ctx = priv;
+	int i;
+
+	if (nbytes == 3 * bsize) {
+		twofish_dec_blk_3way(ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		twofish_dec_blk(ctx, srcdst, srcdst);
+}
+
+struct twofish_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct twofish_ctx twofish_ctx;
+};
+
+static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+	int err;
+
+	err = __twofish_setkey(&ctx->twofish_ctx, key, keylen - TF_BLOCK_SIZE,
+			       &tfm->crt_flags);
+	if (err)
+		return err;
+
+	return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[3];
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &ctx->twofish_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+
+	return lrw_crypt(desc, dst, src, nbytes, &req);
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[3];
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &ctx->twofish_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+
+	return lrw_crypt(desc, dst, src, nbytes, &req);
+}
+
+static void lrw_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	lrw_free_table(&ctx->lrw_table);
+}
+
+static struct crypto_alg blk_lrw_alg = {
+	.cra_name		= "lrw(twofish)",
+	.cra_driver_name	= "lrw-twofish-3way",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_lrw_alg.cra_list),
+	.cra_exit		= lrw_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE + TF_BLOCK_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE + TF_BLOCK_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= lrw_twofish_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+};
+
+#endif
+
 int __init init(void)
 {
 	int err;
@@ -445,9 +568,18 @@ int __init init(void)
 	err = crypto_register_alg(&blk_ctr_alg);
 	if (err)
 		goto ctr_err;
+#ifdef HAS_LRW
+	err = crypto_register_alg(&blk_lrw_alg);
+	if (err)
+		goto blk_lrw_err;
+#endif
 
 	return 0;
 
+#ifdef HAS_LRW
+blk_lrw_err:
+	crypto_unregister_alg(&blk_ctr_alg);
+#endif
 ctr_err:
 	crypto_unregister_alg(&blk_cbc_alg);
 cbc_err:
@@ -458,6 +590,9 @@ ecb_err:
 
 void __exit fini(void)
 {
+#ifdef HAS_LRW
+	crypto_unregister_alg(&blk_lrw_alg);
+#endif
 	crypto_unregister_alg(&blk_ctr_alg);
 	crypto_unregister_alg(&blk_cbc_alg);
 	crypto_unregister_alg(&blk_ecb_alg);
-- 
cgit v1.2.3-55-g7522


From bae6d3038b7faff187f4207448a40b9912cf787d Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Tue, 18 Oct 2011 13:33:43 +0300
Subject: crypto: twofish-x86_64-3way - add xts support

Patch adds XTS support for twofish-x86_64-3way by using xts_crypt(). Patch has
been tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (twofish-3way/twofish-asm speed ratios):

Intel Celeron T1600 (fam:6, model:15, step:13):

size    xts-enc xts-dec
16B     0.98x   1.00x
64B     1.14x   1.15x
256B    1.23x   1.25x
1024B   1.26x   1.29x
8192B   1.28x   1.30x

AMD Phenom II 1055T (fam:16, model:10):

size    xts-enc xts-dec
16B     1.03x   1.03x
64B     1.13x   1.16x
256B    1.20x   1.20x
1024B   1.22x   1.22x
8192B   1.22x   1.21x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish_glue_3way.c | 119 +++++++++++++++++++++++++++++++++++-
 1 file changed, 117 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index fa9151df3637..954f59eeb7b4 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -33,11 +33,16 @@
 #include <crypto/twofish.h>
 #include <crypto/b128ops.h>
 #include <crypto/lrw.h>
+#include <crypto/xts.h>
 
 #if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
 #define HAS_LRW
 #endif
 
+#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
+#define HAS_XTS
+#endif
+
 /* regular block cipher functions from twofish_x86_64 module */
 asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
 				const u8 *src);
@@ -437,7 +442,7 @@ static struct crypto_alg blk_ctr_alg = {
 	},
 };
 
-#ifdef HAS_LRW
+#if defined(HAS_LRW) || defined(HAS_XTS)
 
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
@@ -469,6 +474,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 		twofish_dec_blk(ctx, srcdst, srcdst);
 }
 
+#endif
+
+#ifdef HAS_LRW
+
 struct twofish_lrw_ctx {
 	struct lrw_table_ctx lrw_table;
 	struct twofish_ctx twofish_ctx;
@@ -555,6 +564,99 @@ static struct crypto_alg blk_lrw_alg = {
 
 #endif
 
+#ifdef HAS_XTS
+
+struct twofish_xts_ctx {
+	struct twofish_ctx tweak_ctx;
+	struct twofish_ctx crypt_ctx;
+};
+
+static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+	int err;
+
+	/* key consists of keys of equal size concatenated, therefore
+	 * the length must be even
+	 */
+	if (keylen % 2) {
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	/* first half of xts-key is for crypt */
+	err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+	if (err)
+		return err;
+
+	/* second half of xts-key is for tweak */
+	return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
+				flags);
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[3];
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
+		.crypt_ctx = &ctx->crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+
+	return xts_crypt(desc, dst, src, nbytes, &req);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[3];
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
+		.crypt_ctx = &ctx->crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+
+	return xts_crypt(desc, dst, src, nbytes, &req);
+}
+
+static struct crypto_alg blk_xts_alg = {
+	.cra_name		= "xts(twofish)",
+	.cra_driver_name	= "xts-twofish-3way",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_xts_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE * 2,
+			.max_keysize	= TF_MAX_KEY_SIZE * 2,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= xts_twofish_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+};
+
+#endif
+
 int __init init(void)
 {
 	int err;
@@ -573,13 +675,23 @@ int __init init(void)
 	if (err)
 		goto blk_lrw_err;
 #endif
+#ifdef HAS_XTS
+	err = crypto_register_alg(&blk_xts_alg);
+	if (err)
+		goto blk_xts_err;
+#endif
 
 	return 0;
 
+#ifdef HAS_XTS
+	crypto_unregister_alg(&blk_xts_alg);
+blk_xts_err:
+#endif
 #ifdef HAS_LRW
+	crypto_unregister_alg(&blk_lrw_alg);
 blk_lrw_err:
-	crypto_unregister_alg(&blk_ctr_alg);
 #endif
+	crypto_unregister_alg(&blk_ctr_alg);
 ctr_err:
 	crypto_unregister_alg(&blk_cbc_alg);
 cbc_err:
@@ -590,6 +702,9 @@ ecb_err:
 
 void __exit fini(void)
 {
+#ifdef HAS_XTS
+	crypto_unregister_alg(&blk_xts_alg);
+#endif
 #ifdef HAS_LRW
 	crypto_unregister_alg(&blk_lrw_alg);
 #endif
-- 
cgit v1.2.3-55-g7522


From ff14c1d01576fb839a925a42596582f6c68a1a1a Mon Sep 17 00:00:00 2001
From: Pekka Enberg
Date: Tue, 1 Nov 2011 15:58:16 +0200
Subject: x86, mm: Use MAX_DMA_PFN for ZONE_DMA on 32-bit

Use MAX_DMA_PFN which represents the 16 MB ISA DMA limit on
32-bit x86 just like we do on 64-bit.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-1-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 29f7c6d98179..434c97d620c2 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -679,8 +679,7 @@ static void __init zone_sizes_init(void)
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 #ifdef CONFIG_ZONE_DMA
-	max_zone_pfns[ZONE_DMA] =
-		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
-- 
cgit v1.2.3-55-g7522


From 4c0b2e5f8940fec7cbeafcf641fecd5e746329c5 Mon Sep 17 00:00:00 2001
From: Pekka Enberg
Date: Tue, 1 Nov 2011 15:58:17 +0200
Subject: x86, mm: Move zone init from paging_init() on 64-bit

This patch introduces a zone_sizes_init() helper function on
64-bit to make it more similar to 32-bit init.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-2-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bbaaa005bf0e..3ddda59f7087 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -612,7 +612,7 @@ void __init initmem_init(void)
 }
 #endif
 
-void __init paging_init(void)
+static void __init zone_sizes_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 
@@ -623,6 +623,11 @@ void __init paging_init(void)
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 
+	free_area_init_nodes(max_zone_pfns);
+}
+
+void __init paging_init(void)
+{
 	sparse_memory_present_with_active_regions(MAX_NUMNODES);
 	sparse_init();
 
@@ -634,7 +639,7 @@ void __init paging_init(void)
 	 */
 	node_clear_state(0, N_NORMAL_MEMORY);
 
-	free_area_init_nodes(max_zone_pfns);
+	zone_sizes_init();
 }
 
 /*
-- 
cgit v1.2.3-55-g7522


From e4794640ca408acda18eb31b126f58a58803b9c9 Mon Sep 17 00:00:00 2001
From: Pekka Enberg
Date: Tue, 1 Nov 2011 15:58:18 +0200
Subject: x86, mm: Use max_pfn instead of highend_pfn

The 'highend_pfn' variable is always set to 'max_pfn' so just
use the latter directly.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-3-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 434c97d620c2..5ac0118b7610 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -683,7 +683,7 @@ static void __init zone_sizes_init(void)
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
 #endif
 
 	free_area_init_nodes(max_zone_pfns);
-- 
cgit v1.2.3-55-g7522


From 80b3cac97bc14fdf839d967602e599cbf82ea336 Mon Sep 17 00:00:00 2001
From: Pekka Enberg
Date: Tue, 1 Nov 2011 15:58:19 +0200
Subject: x86, mm: Wrap ZONE_DMA32 with CONFIG_ZONE_DMA32

In preparation for unifying 32-bit and 64-bit zone_sizes_init()
make sure ZONE_DMA32 is wrapped in CONFIG_ZONE_DMA32.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Arun Sharma <asharma@fb.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-4-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3ddda59f7087..a9214e6e721a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -620,7 +620,9 @@ static void __init zone_sizes_init(void)
 #ifdef CONFIG_ZONE_DMA
 	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 #endif
+#ifdef CONFIG_ZONE_DMA32
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+#endif
 	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 
 	free_area_init_nodes(max_zone_pfns);
-- 
cgit v1.2.3-55-g7522


From ece838b6257412647197c072fe59dfc6615df144 Mon Sep 17 00:00:00 2001
From: Pekka Enberg
Date: Tue, 1 Nov 2011 15:58:20 +0200
Subject: x86, mm: Use max_low_pfn for ZONE_NORMAL on 64-bit

64-bit has no highmem so max_low_pfn is always the same as
'max_pfn'.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-5-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a9214e6e721a..f6b1f087cced 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -623,7 +623,7 @@ static void __init zone_sizes_init(void)
 #ifdef CONFIG_ZONE_DMA32
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 #endif
-	max_zone_pfns[ZONE_NORMAL] = max_pfn;
+	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 
 	free_area_init_nodes(max_zone_pfns);
 }
-- 
cgit v1.2.3-55-g7522


From 248b52b97da7a712d2263a51d8d84c959f38ef75 Mon Sep 17 00:00:00 2001
From: Pekka Enberg
Date: Tue, 1 Nov 2011 15:58:21 +0200
Subject: x86, mm: Prepare zone_sizes_init() for unification

Make 32-bit and 64-bit zone_sizes_init() identical in
preparation for unification.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-6-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 4 ++++
 arch/x86/mm/init_64.c | 3 +++
 2 files changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 5ac0118b7610..27455b958b8d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -677,9 +677,13 @@ void __init initmem_init(void)
 static void __init zone_sizes_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
+
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 #ifdef CONFIG_ZONE_DMA
 	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+#endif
+#ifdef CONFIG_ZONE_DMA32
+        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f6b1f087cced..06c4360cf796 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -624,6 +624,9 @@ static void __init zone_sizes_init(void)
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
+#endif
 
 	free_area_init_nodes(max_zone_pfns);
 }
-- 
cgit v1.2.3-55-g7522


From 176239153049a023d060ce95b05f7ef31667e362 Mon Sep 17 00:00:00 2001
From: Pekka Enberg
Date: Tue, 1 Nov 2011 15:58:22 +0200
Subject: x86, mm: Unify zone_sizes_init()

Now that zone_sizes_init() is identical on 32-bit and 64-bit,
move the code to arch/x86/mm/init.c and use it for both
architectures.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-7-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/init.h |  2 ++
 arch/x86/mm/init.c          | 23 +++++++++++++++++++++++
 arch/x86/mm/init_32.c       | 19 -------------------
 arch/x86/mm/init_64.c       | 19 -------------------
 4 files changed, 25 insertions(+), 38 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 8dbe353e41e1..adcc0ae73d09 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,6 +5,8 @@
 extern void __init early_ioremap_page_table_range_init(void);
 #endif
 
+extern void __init zone_sizes_init(void);
+
 extern unsigned long __init
 kernel_physical_mapping_init(unsigned long start,
 			     unsigned long end,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 87488b93a65c..2426b60bb409 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -3,6 +3,7 @@
 #include <linux/ioport.h>
 #include <linux/swap.h>
 #include <linux/memblock.h>
+#include <linux/bootmem.h>	/* for max_low_pfn */
 
 #include <asm/cacheflush.h>
 #include <asm/e820.h>
@@ -15,6 +16,7 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/proto.h>
+#include <asm/dma.h>		/* for MAX_DMA_PFN */
 
 unsigned long __initdata pgt_buf_start;
 unsigned long __meminitdata pgt_buf_end;
@@ -392,3 +394,24 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	free_init_pages("initrd memory", start, PAGE_ALIGN(end));
 }
 #endif
+
+void __init zone_sizes_init(void)
+{
+	unsigned long max_zone_pfns[MAX_NR_ZONES];
+
+	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+
+#ifdef CONFIG_ZONE_DMA
+	max_zone_pfns[ZONE_DMA]		= MAX_DMA_PFN;
+#endif
+#ifdef CONFIG_ZONE_DMA32
+	max_zone_pfns[ZONE_DMA32]	= MAX_DMA32_PFN;
+#endif
+	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
+#endif
+
+	free_area_init_nodes(max_zone_pfns);
+}
+
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 27455b958b8d..3bebaed5021c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -674,25 +674,6 @@ void __init initmem_init(void)
 }
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
-static void __init zone_sizes_init(void)
-{
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-#ifdef CONFIG_ZONE_DMA
-	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-#endif
-#ifdef CONFIG_ZONE_DMA32
-        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-#endif
-	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
-#endif
-
-	free_area_init_nodes(max_zone_pfns);
-}
-
 void __init setup_bootmem_allocator(void)
 {
 	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 06c4360cf796..6fcce7d34555 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -612,25 +612,6 @@ void __init initmem_init(void)
 }
 #endif
 
-static void __init zone_sizes_init(void)
-{
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-#ifdef CONFIG_ZONE_DMA
-	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-#endif
-#ifdef CONFIG_ZONE_DMA32
-	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-#endif
-	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
-#endif
-
-	free_area_init_nodes(max_zone_pfns);
-}
-
 void __init paging_init(void)
 {
 	sparse_memory_present_with_active_regions(MAX_NUMNODES);
-- 
cgit v1.2.3-55-g7522


From bcb71abe7d4c5a0d0368c67da0a7def4fc73497a Mon Sep 17 00:00:00 2001
From: Alex Williamson
Date: Fri, 21 Oct 2011 15:56:24 -0400
Subject: iommu: Add option to group multi-function devices

The option iommu=group_mf indicates the that the iommu driver should
expose all functions of a multi-function PCI device as the same
iommu_device_group.  This is useful for disallowing individual functions
being exposed as independent devices to userspace as there are often
hidden dependencies.  Virtual functions are not affected by this option.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 Documentation/kernel-parameters.txt |  4 +++-
 arch/ia64/include/asm/iommu.h       |  2 ++
 arch/ia64/kernel/pci-dma.c          |  1 +
 arch/x86/include/asm/iommu.h        |  1 +
 arch/x86/kernel/pci-dma.c           | 11 +++++++++++
 drivers/iommu/amd_iommu.c           | 10 +++++++++-
 drivers/iommu/intel-iommu.c         |  3 +++
 7 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a0c5c5f4fce6..e1b6e4469845 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1059,7 +1059,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 		nomerge
 		forcesac
 		soft
-		pt	[x86, IA-64]
+		pt		[x86, IA-64]
+		group_mf	[x86, IA-64]
+
 
 	io7=		[HW] IO7 for Marvel based alpha systems
 			See comment before marvel_specify_io7 in
diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h
index 105c93b00b1b..b6a809fa2995 100644
--- a/arch/ia64/include/asm/iommu.h
+++ b/arch/ia64/include/asm/iommu.h
@@ -11,10 +11,12 @@ extern void no_iommu_init(void);
 extern int force_iommu, no_iommu;
 extern int iommu_pass_through;
 extern int iommu_detected;
+extern int iommu_group_mf;
 #else
 #define iommu_pass_through	(0)
 #define no_iommu		(1)
 #define iommu_detected		(0)
+#define iommu_group_mf		(0)
 #endif
 extern void iommu_dma_init(void);
 extern void machvec_init(const char *name);
diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c
index c16162c70860..eb1175720050 100644
--- a/arch/ia64/kernel/pci-dma.c
+++ b/arch/ia64/kernel/pci-dma.c
@@ -33,6 +33,7 @@ int force_iommu __read_mostly;
 #endif
 
 int iommu_pass_through;
+int iommu_group_mf;
 
 /* Dummy device used for NULL arguments (normally ISA). Better would
    be probably a smaller DMA mask, but this is bug-to-bug compatible
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 345c99cef152..dffc38ee6255 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -5,6 +5,7 @@ extern struct dma_map_ops nommu_dma_ops;
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
 extern int iommu_pass_through;
+extern int iommu_group_mf;
 
 /* 10 seconds */
 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 80dc793b3f63..1c4d769e21ea 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -45,6 +45,15 @@ int iommu_detected __read_mostly = 0;
  */
 int iommu_pass_through __read_mostly;
 
+/*
+ * Group multi-function PCI devices into a single device-group for the
+ * iommu_device_group interface.  This tells the iommu driver to pretend
+ * it cannot distinguish between functions of a device, exposing only one
+ * group for the device.  Useful for disallowing use of individual PCI
+ * functions from userspace drivers.
+ */
+int iommu_group_mf __read_mostly;
+
 extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
 
 /* Dummy device used for NULL arguments (normally ISA). */
@@ -169,6 +178,8 @@ static __init int iommu_setup(char *p)
 #endif
 		if (!strncmp(p, "pt", 2))
 			iommu_pass_through = 1;
+		if (!strncmp(p, "group_mf", 8))
+			iommu_group_mf = 1;
 
 		gart_parse_options(p);
 
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 1d82b631d09c..6f7553684c1e 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2776,11 +2776,19 @@ static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
 static int amd_iommu_device_group(struct device *dev, unsigned int *groupid)
 {
 	struct iommu_dev_data *dev_data = dev->archdata.iommu;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	u16 devid;
 
 	if (!dev_data)
 		return -ENODEV;
 
-	*groupid = amd_iommu_alias_table[dev_data->devid];
+	if (pdev->is_virtfn || !iommu_group_mf)
+		devid = dev_data->devid;
+	else
+		devid = calc_devid(pdev->bus->number,
+				   PCI_DEVFN(PCI_SLOT(pdev->devfn), 0));
+
+	*groupid = amd_iommu_alias_table[devid];
 
 	return 0;
 }
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 39ca6bb17e13..9ef16d664a99 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -4100,6 +4100,9 @@ static int intel_iommu_device_group(struct device *dev, unsigned int *groupid)
 		}
 	}
 
+	if (!pdev->is_virtfn && iommu_group_mf)
+		id.pci.devfn = PCI_DEVFN(PCI_SLOT(id.pci.devfn), 0);
+
 	*groupid = id.group;
 
 	return 0;
-- 
cgit v1.2.3-55-g7522


From b82e324b3c46a554595c12b45465d1943a57326c Mon Sep 17 00:00:00 2001
From: Mika Westerberg
Date: Thu, 10 Nov 2011 13:18:09 +0000
Subject: serial, mfd: don't hardcode the console

Add support to specify which HSU port to use as an early console. This can
be selected by passing "earlyprintk=hsu<n>" on the kernel command line. By
default port 0 is still used.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/x86/include/asm/mrst.h                |  2 +-
 arch/x86/kernel/early_printk.c             |  2 +-
 arch/x86/platform/mrst/early_printk_mrst.c | 16 ++++++++++++----
 drivers/tty/serial/mfd.c                   | 18 +++++++-----------
 4 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 719f00b28ff5..470776039af9 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -51,7 +51,7 @@ extern struct console early_mrst_console;
 extern void mrst_early_console_init(void);
 
 extern struct console early_hsu_console;
-extern void hsu_early_console_init(void);
+extern void hsu_early_console_init(const char *);
 
 extern void intel_scu_devices_create(void);
 extern void intel_scu_devices_destroy(void);
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cd28a350f7f9..9d42a52d2331 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -247,7 +247,7 @@ static int __init setup_early_printk(char *buf)
 		}
 
 		if (!strncmp(buf, "hsu", 3)) {
-			hsu_early_console_init();
+			hsu_early_console_init(buf + 3);
 			early_console_register(&early_hsu_console, keep);
 		}
 #endif
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 25bfdbb5b130..3c6e328483c7 100644
--- a/arch/x86/platform/mrst/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -245,16 +245,24 @@ struct console early_mrst_console = {
  * Following is the early console based on Medfield HSU (High
  * Speed UART) device.
  */
-#define HSU_PORT2_PADDR		0xffa28180
+#define HSU_PORT_BASE		0xffa28080
 
 static void __iomem *phsu;
 
-void hsu_early_console_init(void)
+void hsu_early_console_init(const char *s)
 {
+	unsigned long paddr, port = 0;
 	u8 lcr;
 
-	phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
-							HSU_PORT2_PADDR);
+	/*
+	 * Select the early HSU console port if specified by user in the
+	 * kernel command line.
+	 */
+	if (*s && !kstrtoul(s, 10, &port))
+		port = clamp_val(port, 0, 2);
+
+	paddr = HSU_PORT_BASE + port * 0x80;
+	phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr);
 
 	/* Disable FIFO */
 	writeb(0x0, phsu + UART_FCR);
diff --git a/drivers/tty/serial/mfd.c b/drivers/tty/serial/mfd.c
index 286c386d9c46..565f3fe3dd77 100644
--- a/drivers/tty/serial/mfd.c
+++ b/drivers/tty/serial/mfd.c
@@ -1156,7 +1156,6 @@ serial_hsu_console_setup(struct console *co, char *options)
 	int bits = 8;
 	int parity = 'n';
 	int flow = 'n';
-	int ret;
 
 	if (co->index == -1 || co->index >= serial_hsu_reg.nr)
 		co->index = 0;
@@ -1167,9 +1166,7 @@ serial_hsu_console_setup(struct console *co, char *options)
 	if (options)
 		uart_parse_options(options, &baud, &parity, &bits, &flow);
 
-	ret = uart_set_options(&up->port, co, baud, parity, bits, flow);
-
-	return ret;
+	return uart_set_options(&up->port, co, baud, parity, bits, flow);
 }
 
 static struct console serial_hsu_console = {
@@ -1178,9 +1175,13 @@ static struct console serial_hsu_console = {
 	.device		= uart_console_device,
 	.setup		= serial_hsu_console_setup,
 	.flags		= CON_PRINTBUFFER,
-	.index		= 2,
+	.index		= -1,
 	.data		= &serial_hsu_reg,
 };
+
+#define SERIAL_HSU_CONSOLE	(&serial_hsu_console)
+#else
+#define SERIAL_HSU_CONSOLE	NULL
 #endif
 
 struct uart_ops serial_hsu_pops = {
@@ -1210,6 +1211,7 @@ static struct uart_driver serial_hsu_reg = {
 	.major		= TTY_MAJOR,
 	.minor		= 128,
 	.nr		= 3,
+	.cons		= SERIAL_HSU_CONSOLE,
 };
 
 #ifdef CONFIG_PM
@@ -1344,12 +1346,6 @@ static int serial_hsu_probe(struct pci_dev *pdev,
 		}
 		uart_add_one_port(&serial_hsu_reg, &uport->port);
 
-#ifdef CONFIG_SERIAL_MFD_HSU_CONSOLE
-		if (index == 2) {
-			register_console(&serial_hsu_console);
-			uport->port.cons = &serial_hsu_console;
-		}
-#endif
 		pci_set_drvdata(pdev, uport);
 	}
 
-- 
cgit v1.2.3-55-g7522


From b7641d2c83aa10031bf45afd82619bfaaedcbc6f Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Fri, 11 Nov 2011 15:43:02 -0800
Subject: x86-64, syscall: Adjust comment spacing and remove typo

Adjust spacing for comment so that it matches the multiline comment
style used in the rest of the kernel, and remove word duplication.

It is not really clear what version of gcc this refers to, but the
extra & doesn't cause any harm, so there is no reason to remove it.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/syscall_64.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index de87d6008295..0edfafa1b269 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -21,9 +21,9 @@ extern void sys_ni_syscall(void);
 
 const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 	/*
-	*Smells like a like a compiler bug -- it doesn't work
-	*when the & below is removed.
-	*/
+	 * Smells like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
 	[0 ... __NR_syscall_max] = &sys_ni_syscall,
 #include <asm/unistd_64.h>
 };
-- 
cgit v1.2.3-55-g7522


From e79a7fccfb2ab10f8753ac634a1c8473e870ae6c Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Fri, 11 Nov 2011 15:48:42 -0800
Subject: x86-64, ia32: Move compat_ni_syscall into C and its own file

Move compat_ni_syscall out of ia32entry.S and into its own .c file.
Although this is a trivial function, it is not performance-critical,
and this will simplify further cleanups.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/ia32/Makefile    | 1 +
 arch/x86/ia32/ia32entry.S | 3 ---
 arch/x86/ia32/nosyscall.c | 7 +++++++
 3 files changed, 8 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/ia32/nosyscall.c

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index 52d0ccfcf6ea..eea9a1c77d38 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,6 +3,7 @@
 #
 
 obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
+obj-$(CONFIG_IA32_EMULATION) += nosyscall.o
 
 sysv-$(CONFIG_SYSVIPC) := ipc32.o
 obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a6253ec1b284..59538a777695 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -453,9 +453,6 @@ ia32_badsys:
 	movq $-ENOSYS,%rax
 	jmp ia32_sysret
 
-quiet_ni_syscall:
-	movq $-ENOSYS,%rax
-	ret
 	CFI_ENDPROC
 	
 	.macro PTREGSCALL label, func, arg
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
new file mode 100644
index 000000000000..51ecd5b4e787
--- /dev/null
+++ b/arch/x86/ia32/nosyscall.c
@@ -0,0 +1,7 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+long compat_ni_syscall(void)
+{
+	return -ENOSYS;
+}
-- 
cgit v1.2.3-55-g7522


From d181764ccf6207e02abb95fb3052639b947f4833 Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Fri, 11 Nov 2011 15:55:49 -0800
Subject: x86: Machine-readable syscall tables and scripts to process them

Create a simple set of syscall tables and scripts to turn them into
both header files (unistd_*.h) and macros for generating the system
call tables.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/syscalls/Makefile       |  43 +++++
 arch/x86/syscalls/syscall_32.tbl | 357 +++++++++++++++++++++++++++++++++++++++
 arch/x86/syscalls/syscall_64.tbl | 320 +++++++++++++++++++++++++++++++++++
 arch/x86/syscalls/syscallhdr.sh  |  36 ++++
 arch/x86/syscalls/syscalltbl.sh  |  15 ++
 5 files changed, 771 insertions(+)
 create mode 100644 arch/x86/syscalls/Makefile
 create mode 100644 arch/x86/syscalls/syscall_32.tbl
 create mode 100644 arch/x86/syscalls/syscall_64.tbl
 create mode 100644 arch/x86/syscalls/syscallhdr.sh
 create mode 100644 arch/x86/syscalls/syscalltbl.sh

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile
new file mode 100644
index 000000000000..564b2476fede
--- /dev/null
+++ b/arch/x86/syscalls/Makefile
@@ -0,0 +1,43 @@
+out := $(obj)/../include/generated/asm
+
+# Create output directory if not already present
+_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+
+syscall32 := $(srctree)/$(src)/syscall_32.tbl
+syscall64 := $(srctree)/$(src)/syscall_64.tbl
+
+syshdr := $(srctree)/$(src)/syscallhdr.sh
+systbl := $(srctree)/$(src)/syscalltbl.sh
+
+quiet_cmd_syshdr = SYSHDR  $@
+      cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' $< $@ \
+		   $(syshdr_abi_$(basetarget)) $(syshdr_pfx_$(basetarget))
+quiet_cmd_systbl = SYSTBL  $@
+      cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
+
+syshdr_abi_unistd_32 := i386
+$(out)/unistd_32.h: $(syscall32) $(syshdr)
+	$(call if_changed,syshdr)
+
+syshdr_abi_unistd_32_ia32 := i386
+syshdr_pfx_unistd_32_ia32 := ia32_
+$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr)
+	$(call if_changed,syshdr)
+
+syshdr_abi_unistd_64 := 64
+$(out)/unistd_64.h: $(syscall64) $(syshdr)
+	$(call if_changed,syshdr)
+
+$(out)/syscalls_32.h: $(syscall32) $(systbl)
+	$(call if_changed,systbl)
+$(out)/syscalls_64.h: $(syscall64) $(systbl)
+	$(call if_changed,systbl)
+
+syshdr-y			+= unistd_32.h unistd_64.h
+syshdr-y			+= syscalls_32.h
+syshdr-$(CONFIG_X86_64)		+= unistd_32_ia32.h
+syshdr-$(CONFIG_X86_64)		+= syscalls_64.h
+
+targets	+= $(syshdr-y)
+
+all: $(addprefix $(out)/,$(targets))
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
new file mode 100644
index 000000000000..ce98e287c066
--- /dev/null
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -0,0 +1,357 @@
+#
+# 32-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point> <compat entry point>
+#
+# The abi is always "i386" for this file.
+#
+0	i386	restart_syscall		sys_restart_syscall
+1	i386	exit			sys_exit
+2	i386	fork			ptregs_fork			stub32_fork
+3	i386	read			sys_read
+4	i386	write			sys_write
+5	i386	open			sys_open			compat_sys_open
+6	i386	close			sys_close
+7	i386	waitpid			sys_waitpid			sys32_waitpid
+8	i386	creat			sys_creat
+9	i386	link			sys_link
+10	i386	unlink			sys_unlink
+11	i386	execve			ptregs_execve			stub32_execve
+12	i386	chdir			sys_chdir
+13	i386	time			sys_time			compat_sys_time
+14	i386	mknod			sys_mknod
+15	i386	chmod			sys_chmod
+16	i386	lchown			sys_lchown16
+17	i386	break
+18	i386	oldstat			sys_stat
+19	i386	lseek			sys_lseek			sys32_lseek
+20	i386	getpid			sys_getpid
+21	i386	mount			sys_mount			compat_sys_mount
+22	i386	umount			sys_oldumount
+23	i386	setuid			sys_setuid16
+24	i386	getuid			sys_getuid16
+25	i386	stime			sys_stime			compat_sys_stime
+26	i386	ptrace			sys_ptrace			compat_sys_ptrace
+27	i386	alarm			sys_alarm
+28	i386	oldfstat		sys_fstat
+29	i386	pause			sys_pause
+30	i386	utime			sys_utime			compat_sys_utime
+31	i386	stty
+32	i386	gtty
+33	i386	access			sys_access
+34	i386	nice			sys_nice
+35	i386	ftime
+36	i386	sync			sys_sync
+37	i386	kill			sys_kill			sys32_kill
+38	i386	rename			sys_rename
+39	i386	mkdir			sys_mkdir
+40	i386	rmdir			sys_rmdir
+41	i386	dup			sys_dup
+42	i386	pipe			sys_pipe
+43	i386	times			sys_times			compat_sys_times
+44	i386	prof
+45	i386	brk			sys_brk
+46	i386	setgid			sys_setgid16
+47	i386	getgid			sys_getgid16
+48	i386	signal			sys_signal
+49	i386	geteuid			sys_geteuid16
+50	i386	getegid			sys_getegid16
+51	i386	acct			sys_acct
+52	i386	umount2			sys_umount
+53	i386	lock
+54	i386	ioctl			sys_ioctl			compat_sys_ioctl
+55	i386	fcntl			sys_fcntl			compat_sys_fcntl64
+56	i386	mpx
+57	i386	setpgid			sys_setpgid
+58	i386	ulimit
+59	i386	oldolduname		sys_olduname
+60	i386	umask			sys_umask
+61	i386	chroot			sys_chroot
+62	i386	ustat			sys_ustat			compat_sys_ustat
+63	i386	dup2			sys_dup2
+64	i386	getppid			sys_getppid
+65	i386	getpgrp			sys_getpgrp
+66	i386	setsid			sys_setsid
+67	i386	sigaction		sys_sigaction			sys32_sigaction
+68	i386	sgetmask		sys_sgetmask
+69	i386	ssetmask		sys_ssetmask
+70	i386	setreuid		sys_setreuid16
+71	i386	setregid		sys_setregid16
+72	i386	sigsuspend		sys_sigsuspend			sys32_sigsuspend
+73	i386	sigpending		sys_sigpending			compat_sys_sigpending
+74	i386	sethostname		sys_sethostname
+75	i386	setrlimit		sys_setrlimit			compat_sys_setrlimit
+76	i386	getrlimit		sys_old_getrlimit		compat_sys_old_getrlimit
+77	i386	getrusage		sys_getrusage			compat_sys_getrusage
+78	i386	gettimeofday		sys_gettimeofday		compat_sys_gettimeofday
+79	i386	settimeofday		sys_settimeofday		compat_sys_settimeofday
+80	i386	getgroups		sys_getgroups16
+81	i386	setgroups		sys_setgroups16
+82	i386	select			sys_old_select			compat_sys_old_select
+83	i386	symlink			sys_symlink
+84	i386	oldlstat		sys_lstat
+85	i386	readlink		sys_readlink
+86	i386	uselib			sys_uselib
+87	i386	swapon			sys_swapon
+88	i386	reboot			sys_reboot
+89	i386	readdir			sys_old_readdir			compat_sys_old_readdir
+90	i386	mmap			sys_old_mmap			sys32_mmap
+91	i386	munmap			sys_munmap
+92	i386	truncate		sys_truncate
+93	i386	ftruncate		sys_ftruncate
+94	i386	fchmod			sys_fchmod
+95	i386	fchown			sys_fchown16
+96	i386	getpriority		sys_getpriority
+97	i386	setpriority		sys_setpriority
+98	i386	profil
+99	i386	statfs			sys_statfs			compat_sys_statfs
+100	i386	fstatfs			sys_fstatfs			compat_sys_fstatfs
+101	i386	ioperm			sys_ioperm
+102	i386	socketcall		sys_socketcall			compat_sys_socketcall
+103	i386	syslog			sys_syslog
+104	i386	setitimer		sys_setitimer			compat_sys_setitimer
+105	i386	getitimer		sys_getitimer			compat_sys_getitimer
+106	i386	stat			sys_newstat			compat_sys_newstat
+107	i386	lstat			sys_newlstat			compat_sys_newlstat
+108	i386	fstat			sys_newfstat			compat_sys_newfstat
+109	i386	olduname		sys_uname
+110	i386	iopl			ptregs_iopl			stub32_iopl
+111	i386	vhangup			sys_vhangup
+112	i386	idle
+113	i386	vm86old			ptregs_vm86old			sys32_vm86_warning
+114	i386	wait4			sys_wait4			compat_sys_wait4
+115	i386	swapoff			sys_swapoff
+116	i386	sysinfo			sys_sysinfo			compat_sys_sysinfo
+117	i386	ipc			sys_ipc				sys32_ipc
+118	i386	fsync			sys_fsync
+119	i386	sigreturn		ptregs_sigreturn		stub32_sigreturn
+120	i386	clone			ptregs_clone			stub32_clone
+121	i386	setdomainname		sys_setdomainname
+122	i386	uname			sys_newuname
+123	i386	modify_ldt		sys_modify_ldt
+124	i386	adjtimex		sys_adjtimex			compat_sys_adjtimex
+125	i386	mprotect		sys_mprotect			sys32_mprotect
+126	i386	sigprocmask		sys_sigprocmask			compat_sys_sigprocmask
+127	i386	create_module
+128	i386	init_module		sys_init_module
+129	i386	delete_module		sys_delete_module
+130	i386	get_kernel_syms
+131	i386	quotactl		sys_quotactl			sys32_quotactl
+132	i386	getpgid			sys_getpgid
+133	i386	fchdir			sys_fchdir
+134	i386	bdflush			sys_bdflush
+135	i386	sysfs			sys_sysfs
+136	i386	personality		sys_personality
+137	i386	afs_syscall
+138	i386	setfsuid		sys_setfsuid16
+139	i386	setfsgid		sys_setfsgid16
+140	i386	_llseek			sys_llseek
+141	i386	getdents		sys_getdents			compat_sys_getdents
+142	i386	_newselect		sys_select			compat_sys_select
+143	i386	flock			sys_flock
+144	i386	msync			sys_msync
+145	i386	readv			sys_readv			compat_sys_readv
+146	i386	writev			sys_writev			compat_sys_writev
+147	i386	getsid			sys_getsid
+148	i386	fdatasync		sys_fdatasync
+149	i386	_sysctl			sys_sysctl			compat_sys_sysctl
+150	i386	mlock			sys_mlock
+151	i386	munlock			sys_munlock
+152	i386	mlockall		sys_mlockall
+153	i386	munlockall		sys_munlockall
+154	i386	sched_setparam		sys_sched_setparam
+155	i386	sched_getparam		sys_sched_getparam
+156	i386	sched_setscheduler	sys_sched_setscheduler
+157	i386	sched_getscheduler	sys_sched_getscheduler
+158	i386	sched_yield		sys_sched_yield
+159	i386	sched_get_priority_max	sys_sched_get_priority_max
+160	i386	sched_get_priority_min	sys_sched_get_priority_min
+161	i386	sched_rr_get_interval	sys_sched_rr_get_interval	sys32_sched_rr_get_interval
+162	i386	nanosleep		sys_nanosleep			compat_sys_nanosleep
+163	i386	mremap			sys_mremap
+164	i386	setresuid		sys_setresuid16
+165	i386	getresuid		sys_getresuid16
+166	i386	vm86			ptregs_vm86			sys32_vm86_warning
+167	i386	query_module
+168	i386	poll			sys_poll
+169	i386	nfsservctl
+170	i386	setresgid		sys_setresgid16
+171	i386	getresgid		sys_getresgid16
+172	i386	prctl			sys_prctl
+173	i386	rt_sigreturn		ptregs_rt_sigreturn		stub32_rt_sigreturn
+174	i386	rt_sigaction		sys_rt_sigaction		sys32_rt_sigaction
+175	i386	rt_sigprocmask		sys_rt_sigprocmask		sys32_rt_sigprocmask
+176	i386	rt_sigpending		sys_rt_sigpending		sys32_rt_sigpending
+177	i386	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait
+178	i386	rt_sigqueueinfo		sys_rt_sigqueueinfo		sys32_rt_sigqueueinfo
+179	i386	rt_sigsuspend		sys_rt_sigsuspend
+180	i386	pread64			sys_pread64			sys32_pread
+181	i386	pwrite64		sys_pwrite64			sys32_pwrite
+182	i386	chown			sys_chown16
+183	i386	getcwd			sys_getcwd
+184	i386	capget			sys_capget
+185	i386	capset			sys_capset
+186	i386	sigaltstack		ptregs_sigaltstack		stub32_sigaltstack
+187	i386	sendfile		sys_sendfile			sys32_sendfile
+188	i386	getpmsg
+189	i386	putpmsg
+190	i386	vfork			ptregs_vfork			stub32_vfork
+191	i386	ugetrlimit		sys_getrlimit			compat_sys_getrlimit
+192	i386	mmap2			sys_mmap_pgoff
+193	i386	truncate64		sys_truncate64			sys32_truncate64
+194	i386	ftruncate64		sys_ftruncate64			sys32_ftruncate64
+195	i386	stat64			sys_stat64			sys32_stat64
+196	i386	lstat64			sys_lstat64			sys32_lstat64
+197	i386	fstat64			sys_fstat64			sys32_fstat64
+198	i386	lchown32		sys_lchown
+199	i386	getuid32		sys_getuid
+200	i386	getgid32		sys_getgid
+201	i386	geteuid32		sys_geteuid
+202	i386	getegid32		sys_getegid
+203	i386	setreuid32		sys_setreuid
+204	i386	setregid32		sys_setregid
+205	i386	getgroups32		sys_getgroups
+206	i386	setgroups32		sys_setgroups
+207	i386	fchown32		sys_fchown
+208	i386	setresuid32		sys_setresuid
+209	i386	getresuid32		sys_getresuid
+210	i386	setresgid32		sys_setresgid
+211	i386	getresgid32		sys_getresgid
+212	i386	chown32			sys_chown
+213	i386	setuid32		sys_setuid
+214	i386	setgid32		sys_setgid
+215	i386	setfsuid32		sys_setfsuid
+216	i386	setfsgid32		sys_setfsgid
+217	i386	pivot_root		sys_pivot_root
+218	i386	mincore			sys_mincore
+219	i386	madvise			sys_madvise
+220	i386	getdents64		sys_getdents64			compat_sys_getdents64
+221	i386	fcntl64			sys_fcntl64			compat_sys_fcntl64
+# 222 is unused
+# 223 is unused
+224	i386	gettid			sys_gettid
+225	i386	readahead		sys_readahead			sys32_readahead
+226	i386	setxattr		sys_setxattr
+227	i386	lsetxattr		sys_lsetxattr
+228	i386	fsetxattr		sys_fsetxattr
+229	i386	getxattr		sys_getxattr
+230	i386	lgetxattr		sys_lgetxattr
+231	i386	fgetxattr		sys_fgetxattr
+232	i386	listxattr		sys_listxattr
+233	i386	llistxattr		sys_llistxattr
+234	i386	flistxattr		sys_flistxattr
+235	i386	removexattr		sys_removexattr
+236	i386	lremovexattr		sys_lremovexattr
+237	i386	fremovexattr		sys_fremovexattr
+238	i386	tkill			sys_tkill
+239	i386	sendfile64		sys_sendfile64
+240	i386	futex			sys_futex			compat_sys_futex
+241	i386	sched_setaffinity	sys_sched_setaffinity		compat_sys_sched_setaffinity
+242	i386	sched_getaffinity	sys_sched_getaffinity		compat_sys_sched_getaffinity
+243	i386	set_thread_area		sys_set_thread_area
+244	i386	get_thread_area		sys_get_thread_area
+245	i386	io_setup		sys_io_setup			compat_sys_io_setup
+246	i386	io_destroy		sys_io_destroy
+247	i386	io_getevents		sys_io_getevents		compat_sys_io_getevents
+248	i386	io_submit		sys_io_submit			compat_sys_io_submit
+249	i386	io_cancel		sys_io_cancel
+250	i386	fadvise64		sys_fadvise64			sys32_fadvise64
+# 251 is available for reuse (was briefly sys_set_zone_reclaim)
+252	i386	exit_group		sys_exit_group
+253	i386	lookup_dcookie		sys_lookup_dcookie		sys32_lookup_dcookie
+254	i386	epoll_create		sys_epoll_create
+255	i386	epoll_ctl		sys_epoll_ctl
+256	i386	epoll_wait		sys_epoll_wait
+257	i386	remap_file_pages	sys_remap_file_pages
+258	i386	set_tid_address		sys_set_tid_address
+259	i386	timer_create		sys_timer_create		compat_sys_timer_create
+260	i386	timer_settime		sys_timer_settime		compat_sys_timer_settime
+261	i386	timer_gettime		sys_timer_gettime		compat_sys_timer_gettime
+262	i386	timer_getoverrun	sys_timer_getoverrun
+263	i386	timer_delete		sys_timer_delete
+264	i386	clock_settime		sys_clock_settime		compat_sys_clock_settime
+265	i386	clock_gettime		sys_clock_gettime		compat_sys_clock_gettime
+266	i386	clock_getres		sys_clock_getres		compat_sys_clock_getres
+267	i386	clock_nanosleep		sys_clock_nanosleep		compat_sys_clock_nanosleep
+268	i386	statfs64		sys_statfs64			compat_sys_statfs64
+269	i386	fstatfs64		sys_fstatfs64			compat_sys_fstatfs64
+270	i386	tgkill			sys_tgkill
+271	i386	utimes			sys_utimes			compat_sys_utimes
+272	i386	fadvise64_64		sys_fadvise64_64		sys32_fadvise64_64
+273	i386	vserver
+274	i386	mbind			sys_mbind
+275	i386	get_mempolicy		sys_get_mempolicy		compat_sys_get_mempolicy
+276	i386	set_mempolicy		sys_set_mempolicy
+277	i386	mq_open			sys_mq_open			compat_sys_mq_open
+278	i386	mq_unlink		sys_mq_unlink
+279	i386	mq_timedsend		sys_mq_timedsend		compat_sys_mq_timedsend
+280	i386	mq_timedreceive		sys_mq_timedreceive		compat_sys_mq_timedreceive
+281	i386	mq_notify		sys_mq_notify			compat_sys_mq_notify
+282	i386	mq_getsetaddr		sys_mq_getsetattr		compat_sys_mq_getsetattr
+283	i386	kexec_load		sys_kexec_load			compat_sys_kexec_load
+284	i386	waitid			sys_waitid			compat_sys_waitid
+# 285 sys_setaltroot
+286	i386	add_key			sys_add_key
+287	i386	request_key		sys_request_key
+288	i386	keyctl			sys_keyctl
+289	i386	ioprio_set		sys_ioprio_set
+290	i386	ioprio_get		sys_ioprio_get
+291	i386	inotify_init		sys_inotify_init
+292	i386	inotify_add_watch	sys_inotify_add_watch
+293	i386	inotify_rm_watch	sys_inotify_rm_watch
+294	i386	migrate_pages		sys_migrate_pages
+295	i386	openat			sys_openat			compat_sys_openat
+296	i386	mkdirat			sys_mkdirat
+297	i386	mknodat			sys_mknodat
+298	i386	fchownat		sys_fchownat
+299	i386	futimesat		sys_futimesat			compat_sys_futimesat
+300	i386	fstatat64		sys_fstatat64			sys32_fstatat
+301	i386	unlinkat		sys_unlinkat
+302	i386	renameat		sys_renameat
+303	i386	linkat			sys_linkat
+304	i386	symlinkat		sys_symlinkat
+305	i386	readlinkat		sys_readlinkat
+306	i386	fchmodat		sys_fchmodat
+307	i386	faccessat		sys_faccessat
+308	i386	pselect6		sys_pselect6			compat_sys_pselect6
+309	i386	ppoll			sys_ppoll			compat_sys_ppoll
+310	i386	unshare			sys_unshare
+311	i386	set_robust_list		sys_set_robust_list		compat_sys_set_robust_list
+312	i386	get_robust_list		sys_get_robust_list		compat_sys_get_robust_list
+313	i386	splice			sys_splice
+314	i386	sync_file_range		sys_sync_file_range		sys32_sync_file_range
+315	i386	tee			sys_tee
+316	i386	vmsplice		sys_vmsplice			compat_sys_vmsplice
+317	i386	move_pages		sys_move_pages			compat_sys_move_pages
+318	i386	getcpu			sys_getcpu
+319	i386	epoll_pwait		sys_epoll_pwait
+320	i386	utimensat		sys_utimensat			compat_sys_utimensat
+321	i386	signalfd		sys_signalfd			compat_sys_signalfd
+322	i386	timerfd_create		sys_timerfd_create
+323	i386	eventfd			sys_eventfd
+324	i386	fallocate		sys_fallocate			sys32_fallocate
+325	i386	timerfd_settime		sys_timerfd_settime		compat_sys_timerfd_settime
+326	i386	timerfd_gettime		sys_timerfd_gettime		compat_sys_timerfd_gettime
+327	i386	signalfd4		sys_signalfd4			compat_sys_signalfd4
+328	i386	eventfd2		sys_eventfd2
+329	i386	epoll_create1		sys_epoll_create1
+330	i386	dup3			sys_dup3
+331	i386	pipe2			sys_pipe2
+332	i386	inotify_init1		sys_inotify_init1
+333	i386	preadv			sys_preadv			compat_sys_preadv
+334	i386	pwritev			sys_pwritev			compat_sys_pwritev
+335	i386	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo
+336	i386	perf_event_open		sys_perf_event_open
+337	i386	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg
+338	i386	fanotify_init		sys_fanotify_init
+339	i386	fanotify_mark		sys_fanotify_mark		sys32_fanotify_mark
+340	i386	prlimit64		sys_prlimit64
+341	i386	name_to_handle_at	sys_name_to_handle_at
+342	i386	open_by_handle_at	sys_open_by_handle_at		compat_sys_open_by_handle_at
+343	i386	clock_adjtime		sys_clock_adjtime		compat_sys_clock_adjtime
+344	i386	syncfs			sys_syncfs
+345	i386	sendmmsg		sys_sendmmsg			compat_sys_sendmmsg
+346	i386	setns			sys_setns
+347	i386	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
+348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
new file mode 100644
index 000000000000..b440a8f7eefa
--- /dev/null
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -0,0 +1,320 @@
+#
+# 64-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point>
+#
+# The abi is always "64" for this file (for now.)
+#
+0	64	read			sys_read
+1	64	write			sys_write
+2	64	open			sys_open
+3	64	close			sys_close
+4	64	stat			sys_newstat
+5	64	fstat			sys_newfstat
+6	64	lstat			sys_newlstat
+7	64	poll			sys_poll
+8	64	lseek			sys_lseek
+9	64	mmap			sys_mmap
+10	64	mprotect		sys_mprotect
+11	64	munmap			sys_munmap
+12	64	brk			sys_brk
+13	64	rt_sigaction		sys_rt_sigaction
+14	64	rt_sigprocmask		sys_rt_sigprocmask
+15	64	rt_sigreturn		stub_rt_sigreturn
+16	64	ioctl			sys_ioctl
+17	64	pread64			sys_pread64
+18	64	pwrite64		sys_pwrite64
+19	64	readv			sys_readv
+20	64	writev			sys_writev
+21	64	access			sys_access
+22	64	pipe			sys_pipe
+23	64	select			sys_select
+24	64	sched_yield		sys_sched_yield
+25	64	mremap			sys_mremap
+26	64	msync			sys_msync
+27	64	mincore			sys_mincore
+28	64	madvise			sys_madvise
+29	64	shmget			sys_shmget
+30	64	shmat			sys_shmat
+31	64	shmctl			sys_shmctl
+32	64	dup			sys_dup
+33	64	dup2			sys_dup2
+34	64	pause			sys_pause
+35	64	nanosleep		sys_nanosleep
+36	64	getitimer		sys_getitimer
+37	64	alarm			sys_alarm
+38	64	setitimer		sys_setitimer
+39	64	getpid			sys_getpid
+40	64	sendfile		sys_sendfile64
+41	64	socket			sys_socket
+42	64	connect			sys_connect
+43	64	accept			sys_accept
+44	64	sendto			sys_sendto
+45	64	recvfrom		sys_recvfrom
+46	64	sendmsg			sys_sendmsg
+47	64	recvmsg			sys_recvmsg
+48	64	shutdown		sys_shutdown
+49	64	bind			sys_bind
+50	64	listen			sys_listen
+51	64	getsockname		sys_getsockname
+52	64	getpeername		sys_getpeername
+53	64	socketpair		sys_socketpair
+54	64	setsockopt		sys_setsockopt
+55	64	getsockopt		sys_getsockopt
+56	64	clone			stub_clone
+57	64	fork			stub_fork
+58	64	vfork			stub_vfork
+59	64	execve			stub_execve
+60	64	exit			sys_exit
+61	64	wait4			sys_wait4
+62	64	kill			sys_kill
+63	64	uname			sys_newuname
+64	64	semget			sys_semget
+65	64	semop			sys_semop
+66	64	semctl			sys_semctl
+67	64	shmdt			sys_shmdt
+68	64	msgget			sys_msgget
+69	64	msgsnd			sys_msgsnd
+70	64	msgrcv			sys_msgrcv
+71	64	msgctl			sys_msgctl
+72	64	fcntl			sys_fcntl
+73	64	flock			sys_flock
+74	64	fsync			sys_fsync
+75	64	fdatasync		sys_fdatasync
+76	64	truncate		sys_truncate
+77	64	ftruncate		sys_ftruncate
+78	64	getdents		sys_getdents
+79	64	getcwd			sys_getcwd
+80	64	chdir			sys_chdir
+81	64	fchdir			sys_fchdir
+82	64	rename			sys_rename
+83	64	mkdir			sys_mkdir
+84	64	rmdir			sys_rmdir
+85	64	creat			sys_creat
+86	64	link			sys_link
+87	64	unlink			sys_unlink
+88	64	symlink			sys_symlink
+89	64	readlink		sys_readlink
+90	64	chmod			sys_chmod
+91	64	fchmod			sys_fchmod
+92	64	chown			sys_chown
+93	64	fchown			sys_fchown
+94	64	lchown			sys_lchown
+95	64	umask			sys_umask
+96	64	gettimeofday		sys_gettimeofday
+97	64	getrlimit		sys_getrlimit
+98	64	getrusage		sys_getrusage
+99	64	sysinfo			sys_sysinfo
+100	64	times			sys_times
+101	64	ptrace			sys_ptrace
+102	64	getuid			sys_getuid
+103	64	syslog			sys_syslog
+104	64	getgid			sys_getgid
+105	64	setuid			sys_setuid
+106	64	setgid			sys_setgid
+107	64	geteuid			sys_geteuid
+108	64	getegid			sys_getegid
+109	64	setpgid			sys_setpgid
+110	64	getppid			sys_getppid
+111	64	getpgrp			sys_getpgrp
+112	64	setsid			sys_setsid
+113	64	setreuid		sys_setreuid
+114	64	setregid		sys_setregid
+115	64	getgroups		sys_getgroups
+116	64	setgroups		sys_setgroups
+117	64	setresuid		sys_setresuid
+118	64	getresuid		sys_getresuid
+119	64	setresgid		sys_setresgid
+120	64	getresgid		sys_getresgid
+121	64	getpgid			sys_getpgid
+122	64	setfsuid		sys_setfsuid
+123	64	setfsgid		sys_setfsgid
+124	64	getsid			sys_getsid
+125	64	capget			sys_capget
+126	64	capset			sys_capset
+127	64	rt_sigpending		sys_rt_sigpending
+128	64	rt_sigtimedwait		sys_rt_sigtimedwait
+129	64	rt_sigqueueinfo		sys_rt_sigqueueinfo
+130	64	rt_sigsuspend		sys_rt_sigsuspend
+131	64	sigaltstack		stub_sigaltstack
+132	64	utime			sys_utime
+133	64	mknod			sys_mknod
+134	64	uselib
+135	64	personality		sys_personality
+136	64	ustat			sys_ustat
+137	64	statfs			sys_statfs
+138	64	fstatfs			sys_fstatfs
+139	64	sysfs			sys_sysfs
+140	64	getpriority		sys_getpriority
+141	64	setpriority		sys_setpriority
+142	64	sched_setparam		sys_sched_setparam
+143	64	sched_getparam		sys_sched_getparam
+144	64	sched_setscheduler	sys_sched_setscheduler
+145	64	sched_getscheduler	sys_sched_getscheduler
+146	64	sched_get_priority_max	sys_sched_get_priority_max
+147	64	sched_get_priority_min	sys_sched_get_priority_min
+148	64	sched_rr_get_interval	sys_sched_rr_get_interval
+149	64	mlock			sys_mlock
+150	64	munlock			sys_munlock
+151	64	mlockall		sys_mlockall
+152	64	munlockall		sys_munlockall
+153	64	vhangup			sys_vhangup
+154	64	modify_ldt		sys_modify_ldt
+155	64	pivot_root		sys_pivot_root
+156	64	_sysctl			sys_sysctl
+157	64	prctl			sys_prctl
+158	64	arch_prctl		sys_arch_prctl
+159	64	adjtimex		sys_adjtimex
+160	64	setrlimit		sys_setrlimit
+161	64	chroot			sys_chroot
+162	64	sync			sys_sync
+163	64	acct			sys_acct
+164	64	settimeofday		sys_settimeofday
+165	64	mount			sys_mount
+166	64	umount2			sys_umount
+167	64	swapon			sys_swapon
+168	64	swapoff			sys_swapoff
+169	64	reboot			sys_reboot
+170	64	sethostname		sys_sethostname
+171	64	setdomainname		sys_setdomainname
+172	64	iopl			stub_iopl
+173	64	ioperm			sys_ioperm
+174	64	create_module
+175	64	init_module		sys_init_module
+176	64	delete_module		sys_delete_module
+177	64	get_kernel_syms
+178	64	query_module
+179	64	quotactl		sys_quotactl
+180	64	nfsservctl
+181	64	getpmsg
+182	64	putpmsg
+183	64	afs_syscall
+184	64	tuxcall
+185	64	security
+186	64	gettid			sys_gettid
+187	64	readahead		sys_readahead
+188	64	setxattr		sys_setxattr
+189	64	lsetxattr		sys_lsetxattr
+190	64	fsetxattr		sys_fsetxattr
+191	64	getxattr		sys_getxattr
+192	64	lgetxattr		sys_lgetxattr
+193	64	fgetxattr		sys_fgetxattr
+194	64	listxattr		sys_listxattr
+195	64	llistxattr		sys_llistxattr
+196	64	flistxattr		sys_flistxattr
+197	64	removexattr		sys_removexattr
+198	64	lremovexattr		sys_lremovexattr
+199	64	fremovexattr		sys_fremovexattr
+200	64	tkill			sys_tkill
+201	64	time			sys_time
+202	64	futex			sys_futex
+203	64	sched_setaffinity	sys_sched_setaffinity
+204	64	sched_getaffinity	sys_sched_getaffinity
+205	64	set_thread_area
+206	64	io_setup		sys_io_setup
+207	64	io_destroy		sys_io_destroy
+208	64	io_getevents		sys_io_getevents
+209	64	io_submit		sys_io_submit
+210	64	io_cancel		sys_io_cancel
+211	64	get_thread_area
+212	64	lookup_dcookie		sys_lookup_dcookie
+213	64	epoll_create		sys_epoll_create
+214	64	epoll_ctl_old
+215	64	epoll_wait_old
+216	64	remap_file_pages	sys_remap_file_pages
+217	64	getdents64		sys_getdents64
+218	64	set_tid_address		sys_set_tid_address
+219	64	restart_syscall		sys_restart_syscall
+220	64	semtimedop		sys_semtimedop
+221	64	fadvise64		sys_fadvise64
+222	64	timer_create		sys_timer_create
+223	64	timer_settime		sys_timer_settime
+224	64	timer_gettime		sys_timer_gettime
+225	64	timer_getoverrun	sys_timer_getoverrun
+226	64	timer_delete		sys_timer_delete
+227	64	clock_settime		sys_clock_settime
+228	64	clock_gettime		sys_clock_gettime
+229	64	clock_getres		sys_clock_getres
+230	64	clock_nanosleep		sys_clock_nanosleep
+231	64	exit_group		sys_exit_group
+232	64	epoll_wait		sys_epoll_wait
+233	64	epoll_ctl		sys_epoll_ctl
+234	64	tgkill			sys_tgkill
+235	64	utimes			sys_utimes
+236	64	vserver
+237	64	mbind			sys_mbind
+238	64	set_mempolicy		sys_set_mempolicy
+239	64	get_mempolicy		sys_get_mempolicy
+240	64	mq_open			sys_mq_open
+241	64	mq_unlink		sys_mq_unlink
+242	64	mq_timedsend		sys_mq_timedsend
+243	64	mq_timedreceive		sys_mq_timedreceive
+244	64	mq_notify		sys_mq_notify
+245	64	mq_getsetattr		sys_mq_getsetattr
+246	64	kexec_load		sys_kexec_load
+247	64	waitid			sys_waitid
+248	64	add_key			sys_add_key
+249	64	request_key		sys_request_key
+250	64	keyctl			sys_keyctl
+251	64	ioprio_set		sys_ioprio_set
+252	64	ioprio_get		sys_ioprio_get
+253	64	inotify_init		sys_inotify_init
+254	64	inotify_add_watch	sys_inotify_add_watch
+255	64	inotify_rm_watch	sys_inotify_rm_watch
+256	64	migrate_pages		sys_migrate_pages
+257	64	openat			sys_openat
+258	64	mkdirat			sys_mkdirat
+259	64	mknodat			sys_mknodat
+260	64	fchownat		sys_fchownat
+261	64	futimesat		sys_futimesat
+262	64	newfstatat		sys_newfstatat
+263	64	unlinkat		sys_unlinkat
+264	64	renameat		sys_renameat
+265	64	linkat			sys_linkat
+266	64	symlinkat		sys_symlinkat
+267	64	readlinkat		sys_readlinkat
+268	64	fchmodat		sys_fchmodat
+269	64	faccessat		sys_faccessat
+270	64	pselect6		sys_pselect6
+271	64	ppoll			sys_ppoll
+272	64	unshare			sys_unshare
+273	64	set_robust_list		sys_set_robust_list
+274	64	get_robust_list		sys_get_robust_list
+275	64	splice			sys_splice
+276	64	tee			sys_tee
+277	64	sync_file_range		sys_sync_file_range
+278	64	vmsplice		sys_vmsplice
+279	64	move_pages		sys_move_pages
+280	64	utimensat		sys_utimensat
+281	64	epoll_pwait		sys_epoll_pwait
+282	64	signalfd		sys_signalfd
+283	64	timerfd_create		sys_timerfd_create
+284	64	eventfd			sys_eventfd
+285	64	fallocate		sys_fallocate
+286	64	timerfd_settime		sys_timerfd_settime
+287	64	timerfd_gettime		sys_timerfd_gettime
+288	64	accept4			sys_accept4
+289	64	signalfd4		sys_signalfd4
+290	64	eventfd2		sys_eventfd2
+291	64	epoll_create1		sys_epoll_create1
+292	64	dup3			sys_dup3
+293	64	pipe2			sys_pipe2
+294	64	inotify_init1		sys_inotify_init1
+295	64	preadv			sys_preadv
+296	64	pwritev			sys_pwritev
+297	64	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo
+298	64	perf_event_open		sys_perf_event_open
+299	64	recvmmsg		sys_recvmmsg
+300	64	fanotify_init		sys_fanotify_init
+301	64	fanotify_mark		sys_fanotify_mark
+302	64	prlimit64		sys_prlimit64
+303	64	name_to_handle_at	sys_name_to_handle_at
+304	64	open_by_handle_at	sys_open_by_handle_at
+305	64	clock_adjtime		sys_clock_adjtime
+306	64	syncfs			sys_syncfs
+307	64	sendmmsg		sys_sendmmsg
+308	64	setns			sys_setns
+309	64	getcpu			sys_getcpu
+310	64	process_vm_readv	sys_process_vm_readv
+311	64	process_vm_writev	sys_process_vm_writev
diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh
new file mode 100644
index 000000000000..0d473ff12eaf
--- /dev/null
+++ b/arch/x86/syscalls/syscallhdr.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+in="$1"
+out="$2"
+my_abis=`echo "$3" | tr ',' ' '`
+prefix="$4"
+offset="$5"
+
+fileguard=_ASM_X86_`basename "$out" | sed \
+    -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
+    -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
+
+in_list () {
+    local x
+    for x in $1; do
+	if [ x"$x" = x"$2" ]; then
+	    return 0
+	fi
+    done
+    return 1
+}
+
+grep '^[0-9]' "$in" | sort -n | (
+    echo "#ifndef ${fileguard}"
+    echo "#define ${fileguard} 1"
+    echo ""
+
+    while read nr abi name entry ; do
+	if in_list "$my_abis" "$abi"; then
+	    echo "#define __NR_${prefix}${name}" $((nr+offset))
+        fi
+    done
+
+    echo ""
+    echo "#endif /* ${fileguard} */"
+) > "$out"
diff --git a/arch/x86/syscalls/syscalltbl.sh b/arch/x86/syscalls/syscalltbl.sh
new file mode 100644
index 000000000000..0e7f8ec071e7
--- /dev/null
+++ b/arch/x86/syscalls/syscalltbl.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+in="$1"
+out="$2"
+
+grep '^[0-9]' "$in" | sort -n | (
+    while read nr abi name entry compat; do
+	abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
+	if [ -n "$compat" ]; then
+	    echo "__SYSCALL_${abi}($nr, $entry, $compat)"
+	elif [ -n "$entry" ]; then
+	    echo "__SYSCALL_${abi}($nr, $entry, $entry)"
+	fi
+    done
+) > "$out"
-- 
cgit v1.2.3-55-g7522


From 303395ac3bf3e2cb488435537d416bc840438fcb Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Fri, 11 Nov 2011 16:07:41 -0800
Subject: x86: Generate system call tables and unistd_*.h from tables

Generate system call tables and unistd_*.h automatically from the
tables in arch/x86/syscalls.  All other information, like NR_syscalls,
is auto-generated, some of which is in asm-offsets_*.c.

This allows us to keep all the system call information in one place,
and allows for kernel space and user space to see different
information; this is currently used for the ia32 system call numbers
when building the 64-bit kernel, but will be used by the x32 ABI in
the near future.

This also removes some gratuitious differences between i386, x86-64
and ia32; in particular, now all system call tables are generated with
the same mechanism.

Cc: H. J. Lu <hjl.tools@gmail.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Michal Marek <mmarek@suse.cz>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/Makefile                  |   6 +
 arch/x86/ia32/Makefile             |   2 +-
 arch/x86/ia32/ia32entry.S          | 356 ------------------
 arch/x86/ia32/syscall_ia32.c       |  25 ++
 arch/x86/include/asm/Kbuild        |   5 +-
 arch/x86/include/asm/ia32_unistd.h |  13 +-
 arch/x86/include/asm/unistd.h      |  54 ++-
 arch/x86/include/asm/unistd_32.h   | 401 --------------------
 arch/x86/include/asm/unistd_64.h   | 732 -------------------------------------
 arch/x86/kernel/Makefile           |   3 +-
 arch/x86/kernel/asm-offsets_32.c   |   8 +
 arch/x86/kernel/asm-offsets_64.c   |  19 +-
 arch/x86/kernel/entry_32.S         |  37 +-
 arch/x86/kernel/syscall_32.c       |  25 ++
 arch/x86/kernel/syscall_64.c       |  14 +-
 arch/x86/kernel/syscall_table_32.S | 350 ------------------
 16 files changed, 154 insertions(+), 1896 deletions(-)
 create mode 100644 arch/x86/ia32/syscall_ia32.c
 delete mode 100644 arch/x86/include/asm/unistd_32.h
 delete mode 100644 arch/x86/include/asm/unistd_64.h
 create mode 100644 arch/x86/kernel/syscall_32.c
 delete mode 100644 arch/x86/kernel/syscall_table_32.S

(limited to 'arch/x86')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b02e509072a7..209ba1294592 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -117,6 +117,12 @@ KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
 KBUILD_CFLAGS += $(mflags-y)
 KBUILD_AFLAGS += $(mflags-y)
 
+###
+# Syscall table generation
+
+archheaders:
+	$(Q)$(MAKE) $(build)=arch/x86/syscalls all
+
 ###
 # Kernel objects
 
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index eea9a1c77d38..455646e0e532 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
-obj-$(CONFIG_IA32_EMULATION) += nosyscall.o
+obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
 
 sysv-$(CONFIG_SYSVIPC) := ipc32.o
 obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 59538a777695..72f853aea478 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -27,8 +27,6 @@
 
 	.section .entry.text, "ax"
 
-#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
-
 	.macro IA32_ARG_FIXUP noebp=0
 	movl	%edi,%r8d
 	.if \noebp
@@ -496,357 +494,3 @@ ENTRY(ia32_ptregs_common)
 	jmp  ia32_sysret	/* misbalances the return cache */
 	CFI_ENDPROC
 END(ia32_ptregs_common)
-
-	.section .rodata,"a"
-	.align 8
-ia32_sys_call_table:
-	.quad sys_restart_syscall
-	.quad sys_exit
-	.quad stub32_fork
-	.quad sys_read
-	.quad sys_write
-	.quad compat_sys_open		/* 5 */
-	.quad sys_close
-	.quad sys32_waitpid
-	.quad sys_creat
-	.quad sys_link
-	.quad sys_unlink		/* 10 */
-	.quad stub32_execve
-	.quad sys_chdir
-	.quad compat_sys_time
-	.quad sys_mknod
-	.quad sys_chmod		/* 15 */
-	.quad sys_lchown16
-	.quad quiet_ni_syscall			/* old break syscall holder */
-	.quad sys_stat
-	.quad sys32_lseek
-	.quad sys_getpid		/* 20 */
-	.quad compat_sys_mount	/* mount  */
-	.quad sys_oldumount	/* old_umount  */
-	.quad sys_setuid16
-	.quad sys_getuid16
-	.quad compat_sys_stime	/* stime */		/* 25 */
-	.quad compat_sys_ptrace	/* ptrace */
-	.quad sys_alarm
-	.quad sys_fstat	/* (old)fstat */
-	.quad sys_pause
-	.quad compat_sys_utime	/* 30 */
-	.quad quiet_ni_syscall	/* old stty syscall holder */
-	.quad quiet_ni_syscall	/* old gtty syscall holder */
-	.quad sys_access
-	.quad sys_nice	
-	.quad quiet_ni_syscall	/* 35 */	/* old ftime syscall holder */
-	.quad sys_sync
-	.quad sys32_kill
-	.quad sys_rename
-	.quad sys_mkdir
-	.quad sys_rmdir		/* 40 */
-	.quad sys_dup
-	.quad sys_pipe
-	.quad compat_sys_times
-	.quad quiet_ni_syscall			/* old prof syscall holder */
-	.quad sys_brk		/* 45 */
-	.quad sys_setgid16
-	.quad sys_getgid16
-	.quad sys_signal
-	.quad sys_geteuid16
-	.quad sys_getegid16	/* 50 */
-	.quad sys_acct
-	.quad sys_umount			/* new_umount */
-	.quad quiet_ni_syscall			/* old lock syscall holder */
-	.quad compat_sys_ioctl
-	.quad compat_sys_fcntl64		/* 55 */
-	.quad quiet_ni_syscall			/* old mpx syscall holder */
-	.quad sys_setpgid
-	.quad quiet_ni_syscall			/* old ulimit syscall holder */
-	.quad sys_olduname
-	.quad sys_umask		/* 60 */
-	.quad sys_chroot
-	.quad compat_sys_ustat
-	.quad sys_dup2
-	.quad sys_getppid
-	.quad sys_getpgrp		/* 65 */
-	.quad sys_setsid
-	.quad sys32_sigaction
-	.quad sys_sgetmask
-	.quad sys_ssetmask
-	.quad sys_setreuid16	/* 70 */
-	.quad sys_setregid16
-	.quad sys32_sigsuspend
-	.quad compat_sys_sigpending
-	.quad sys_sethostname
-	.quad compat_sys_setrlimit	/* 75 */
-	.quad compat_sys_old_getrlimit	/* old_getrlimit */
-	.quad compat_sys_getrusage
-	.quad compat_sys_gettimeofday
-	.quad compat_sys_settimeofday
-	.quad sys_getgroups16	/* 80 */
-	.quad sys_setgroups16
-	.quad compat_sys_old_select
-	.quad sys_symlink
-	.quad sys_lstat
-	.quad sys_readlink		/* 85 */
-	.quad sys_uselib
-	.quad sys_swapon
-	.quad sys_reboot
-	.quad compat_sys_old_readdir
-	.quad sys32_mmap		/* 90 */
-	.quad sys_munmap
-	.quad sys_truncate
-	.quad sys_ftruncate
-	.quad sys_fchmod
-	.quad sys_fchown16		/* 95 */
-	.quad sys_getpriority
-	.quad sys_setpriority
-	.quad quiet_ni_syscall			/* old profil syscall holder */
-	.quad compat_sys_statfs
-	.quad compat_sys_fstatfs		/* 100 */
-	.quad sys_ioperm
-	.quad compat_sys_socketcall
-	.quad sys_syslog
-	.quad compat_sys_setitimer
-	.quad compat_sys_getitimer	/* 105 */
-	.quad compat_sys_newstat
-	.quad compat_sys_newlstat
-	.quad compat_sys_newfstat
-	.quad sys_uname
-	.quad stub32_iopl		/* 110 */
-	.quad sys_vhangup
-	.quad quiet_ni_syscall	/* old "idle" system call */
-	.quad sys32_vm86_warning	/* vm86old */ 
-	.quad compat_sys_wait4
-	.quad sys_swapoff		/* 115 */
-	.quad compat_sys_sysinfo
-	.quad sys32_ipc
-	.quad sys_fsync
-	.quad stub32_sigreturn
-	.quad stub32_clone		/* 120 */
-	.quad sys_setdomainname
-	.quad sys_newuname
-	.quad sys_modify_ldt
-	.quad compat_sys_adjtimex
-	.quad sys32_mprotect		/* 125 */
-	.quad compat_sys_sigprocmask
-	.quad quiet_ni_syscall		/* create_module */
-	.quad sys_init_module
-	.quad sys_delete_module
-	.quad quiet_ni_syscall		/* 130  get_kernel_syms */
-	.quad sys32_quotactl
-	.quad sys_getpgid
-	.quad sys_fchdir
-	.quad quiet_ni_syscall	/* bdflush */
-	.quad sys_sysfs		/* 135 */
-	.quad sys_personality
-	.quad quiet_ni_syscall	/* for afs_syscall */
-	.quad sys_setfsuid16
-	.quad sys_setfsgid16
-	.quad sys_llseek		/* 140 */
-	.quad compat_sys_getdents
-	.quad compat_sys_select
-	.quad sys_flock
-	.quad sys_msync
-	.quad compat_sys_readv		/* 145 */
-	.quad compat_sys_writev
-	.quad sys_getsid
-	.quad sys_fdatasync
-	.quad compat_sys_sysctl	/* sysctl */
-	.quad sys_mlock		/* 150 */
-	.quad sys_munlock
-	.quad sys_mlockall
-	.quad sys_munlockall
-	.quad sys_sched_setparam
-	.quad sys_sched_getparam   /* 155 */
-	.quad sys_sched_setscheduler
-	.quad sys_sched_getscheduler
-	.quad sys_sched_yield
-	.quad sys_sched_get_priority_max
-	.quad sys_sched_get_priority_min  /* 160 */
-	.quad sys32_sched_rr_get_interval
-	.quad compat_sys_nanosleep
-	.quad sys_mremap
-	.quad sys_setresuid16
-	.quad sys_getresuid16	/* 165 */
-	.quad sys32_vm86_warning	/* vm86 */ 
-	.quad quiet_ni_syscall	/* query_module */
-	.quad sys_poll
-	.quad quiet_ni_syscall /* old nfsservctl */
-	.quad sys_setresgid16	/* 170 */
-	.quad sys_getresgid16
-	.quad sys_prctl
-	.quad stub32_rt_sigreturn
-	.quad sys32_rt_sigaction
-	.quad sys32_rt_sigprocmask	/* 175 */
-	.quad sys32_rt_sigpending
-	.quad compat_sys_rt_sigtimedwait
-	.quad sys32_rt_sigqueueinfo
-	.quad sys_rt_sigsuspend
-	.quad sys32_pread		/* 180 */
-	.quad sys32_pwrite
-	.quad sys_chown16
-	.quad sys_getcwd
-	.quad sys_capget
-	.quad sys_capset
-	.quad stub32_sigaltstack
-	.quad sys32_sendfile
-	.quad quiet_ni_syscall		/* streams1 */
-	.quad quiet_ni_syscall		/* streams2 */
-	.quad stub32_vfork            /* 190 */
-	.quad compat_sys_getrlimit
-	.quad sys_mmap_pgoff
-	.quad sys32_truncate64
-	.quad sys32_ftruncate64
-	.quad sys32_stat64		/* 195 */
-	.quad sys32_lstat64
-	.quad sys32_fstat64
-	.quad sys_lchown
-	.quad sys_getuid
-	.quad sys_getgid		/* 200 */
-	.quad sys_geteuid
-	.quad sys_getegid
-	.quad sys_setreuid
-	.quad sys_setregid
-	.quad sys_getgroups	/* 205 */
-	.quad sys_setgroups
-	.quad sys_fchown
-	.quad sys_setresuid
-	.quad sys_getresuid
-	.quad sys_setresgid	/* 210 */
-	.quad sys_getresgid
-	.quad sys_chown
-	.quad sys_setuid
-	.quad sys_setgid
-	.quad sys_setfsuid		/* 215 */
-	.quad sys_setfsgid
-	.quad sys_pivot_root
-	.quad sys_mincore
-	.quad sys_madvise
-	.quad compat_sys_getdents64	/* 220 getdents64 */
-	.quad compat_sys_fcntl64	
-	.quad quiet_ni_syscall		/* tux */
-	.quad quiet_ni_syscall    	/* security */
-	.quad sys_gettid	
-	.quad sys32_readahead	/* 225 */
-	.quad sys_setxattr
-	.quad sys_lsetxattr
-	.quad sys_fsetxattr
-	.quad sys_getxattr
-	.quad sys_lgetxattr	/* 230 */
-	.quad sys_fgetxattr
-	.quad sys_listxattr
-	.quad sys_llistxattr
-	.quad sys_flistxattr
-	.quad sys_removexattr	/* 235 */
-	.quad sys_lremovexattr
-	.quad sys_fremovexattr
-	.quad sys_tkill
-	.quad sys_sendfile64 
-	.quad compat_sys_futex		/* 240 */
-	.quad compat_sys_sched_setaffinity
-	.quad compat_sys_sched_getaffinity
-	.quad sys_set_thread_area
-	.quad sys_get_thread_area
-	.quad compat_sys_io_setup	/* 245 */
-	.quad sys_io_destroy
-	.quad compat_sys_io_getevents
-	.quad compat_sys_io_submit
-	.quad sys_io_cancel
-	.quad sys32_fadvise64		/* 250 */
-	.quad quiet_ni_syscall 	/* free_huge_pages */
-	.quad sys_exit_group
-	.quad sys32_lookup_dcookie
-	.quad sys_epoll_create
-	.quad sys_epoll_ctl		/* 255 */
-	.quad sys_epoll_wait
-	.quad sys_remap_file_pages
-	.quad sys_set_tid_address
-	.quad compat_sys_timer_create
-	.quad compat_sys_timer_settime	/* 260 */
-	.quad compat_sys_timer_gettime
-	.quad sys_timer_getoverrun
-	.quad sys_timer_delete
-	.quad compat_sys_clock_settime
-	.quad compat_sys_clock_gettime	/* 265 */
-	.quad compat_sys_clock_getres
-	.quad compat_sys_clock_nanosleep
-	.quad compat_sys_statfs64
-	.quad compat_sys_fstatfs64
-	.quad sys_tgkill		/* 270 */
-	.quad compat_sys_utimes
-	.quad sys32_fadvise64_64
-	.quad quiet_ni_syscall	/* sys_vserver */
-	.quad sys_mbind
-	.quad compat_sys_get_mempolicy	/* 275 */
-	.quad sys_set_mempolicy
-	.quad compat_sys_mq_open
-	.quad sys_mq_unlink
-	.quad compat_sys_mq_timedsend
-	.quad compat_sys_mq_timedreceive	/* 280 */
-	.quad compat_sys_mq_notify
-	.quad compat_sys_mq_getsetattr
-	.quad compat_sys_kexec_load	/* reserved for kexec */
-	.quad compat_sys_waitid
-	.quad quiet_ni_syscall		/* 285: sys_altroot */
-	.quad sys_add_key
-	.quad sys_request_key
-	.quad sys_keyctl
-	.quad sys_ioprio_set
-	.quad sys_ioprio_get		/* 290 */
-	.quad sys_inotify_init
-	.quad sys_inotify_add_watch
-	.quad sys_inotify_rm_watch
-	.quad sys_migrate_pages
-	.quad compat_sys_openat		/* 295 */
-	.quad sys_mkdirat
-	.quad sys_mknodat
-	.quad sys_fchownat
-	.quad compat_sys_futimesat
-	.quad sys32_fstatat		/* 300 */
-	.quad sys_unlinkat
-	.quad sys_renameat
-	.quad sys_linkat
-	.quad sys_symlinkat
-	.quad sys_readlinkat		/* 305 */
-	.quad sys_fchmodat
-	.quad sys_faccessat
-	.quad compat_sys_pselect6
-	.quad compat_sys_ppoll
-	.quad sys_unshare		/* 310 */
-	.quad compat_sys_set_robust_list
-	.quad compat_sys_get_robust_list
-	.quad sys_splice
-	.quad sys32_sync_file_range
-	.quad sys_tee			/* 315 */
-	.quad compat_sys_vmsplice
-	.quad compat_sys_move_pages
-	.quad sys_getcpu
-	.quad sys_epoll_pwait
-	.quad compat_sys_utimensat	/* 320 */
-	.quad compat_sys_signalfd
-	.quad sys_timerfd_create
-	.quad sys_eventfd
-	.quad sys32_fallocate
-	.quad compat_sys_timerfd_settime	/* 325 */
-	.quad compat_sys_timerfd_gettime
-	.quad compat_sys_signalfd4
-	.quad sys_eventfd2
-	.quad sys_epoll_create1
-	.quad sys_dup3				/* 330 */
-	.quad sys_pipe2
-	.quad sys_inotify_init1
-	.quad compat_sys_preadv
-	.quad compat_sys_pwritev
-	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
-	.quad sys_perf_event_open
-	.quad compat_sys_recvmmsg
-	.quad sys_fanotify_init
-	.quad sys32_fanotify_mark
-	.quad sys_prlimit64		/* 340 */
-	.quad sys_name_to_handle_at
-	.quad compat_sys_open_by_handle_at
-	.quad compat_sys_clock_adjtime
-	.quad sys_syncfs
-	.quad compat_sys_sendmmsg	/* 345 */
-	.quad sys_setns
-	.quad compat_sys_process_vm_readv
-	.quad compat_sys_process_vm_writev
-ia32_syscall_end:
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
new file mode 100644
index 000000000000..d04d3dbc47d4
--- /dev/null
+++ b/arch/x86/ia32/syscall_ia32.c
@@ -0,0 +1,25 @@
+/* System call table for ia32 emulation. */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <asm/asm-offsets.h>
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
+#include <asm/syscalls_32.h>
+#undef __SYSCALL_I386
+
+#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
+
+typedef void (*sys_call_ptr_t)(void);
+
+extern void compat_ni_syscall(void);
+
+const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
+	/*
+	 * Smells like a like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
+#include <asm/syscalls_32.h>
+};
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 6fa90a845e4c..b57e6a43a37a 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -19,7 +19,8 @@ header-y += processor-flags.h
 header-y += ptrace-abi.h
 header-y += sigcontext32.h
 header-y += ucontext.h
-header-y += unistd_32.h
-header-y += unistd_64.h
 header-y += vm86.h
 header-y += vsyscall.h
+
+genhdr-y += unistd_32.h
+genhdr-y += unistd_64.h
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
index 976f6ecd2ce6..b0d5716ca1e4 100644
--- a/arch/x86/include/asm/ia32_unistd.h
+++ b/arch/x86/include/asm/ia32_unistd.h
@@ -2,17 +2,10 @@
 #define _ASM_X86_IA32_UNISTD_H
 
 /*
- * This file contains the system call numbers of the ia32 port,
+ * This file contains the system call numbers of the ia32 compat ABI,
  * this is for the kernel only.
- * Only add syscalls here where some part of the kernel needs to know
- * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK
  */
-
-#define __NR_ia32_restart_syscall 0
-#define __NR_ia32_exit		  1
-#define __NR_ia32_read		  3
-#define __NR_ia32_write		  4
-#define __NR_ia32_sigreturn	119
-#define __NR_ia32_rt_sigreturn	173
+#define __SYSCALL_ia32_NR(x) (x)
+#include <asm/unistd_32_ia32.h>
 
 #endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 2a58ed3e51d8..b4a3db7ce140 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -1,13 +1,59 @@
+#ifndef _ASM_X86_UNISTD_H
+#define _ASM_X86_UNISTD_H 1
+
 #ifdef __KERNEL__
 # ifdef CONFIG_X86_32
-#  include "unistd_32.h"
+
+#  include <asm/unistd_32.h>
+#  define __ARCH_WANT_IPC_PARSE_VERSION
+#  define __ARCH_WANT_STAT64
+#  define __ARCH_WANT_SYS_OLD_MMAP
+#  define __ARCH_WANT_SYS_OLD_SELECT
+
 # else
-#  include "unistd_64.h"
+
+#  include <asm/unistd_64.h>
+#  define __ARCH_WANT_COMPAT_SYS_TIME
+
 # endif
+
+# define __ARCH_WANT_OLD_READDIR
+# define __ARCH_WANT_OLD_STAT
+# define __ARCH_WANT_SYS_ALARM
+# define __ARCH_WANT_SYS_FADVISE64
+# define __ARCH_WANT_SYS_GETHOSTNAME
+# define __ARCH_WANT_SYS_GETPGRP
+# define __ARCH_WANT_SYS_LLSEEK
+# define __ARCH_WANT_SYS_NICE
+# define __ARCH_WANT_SYS_OLDUMOUNT
+# define __ARCH_WANT_SYS_OLD_GETRLIMIT
+# define __ARCH_WANT_SYS_OLD_UNAME
+# define __ARCH_WANT_SYS_PAUSE
+# define __ARCH_WANT_SYS_RT_SIGACTION
+# define __ARCH_WANT_SYS_RT_SIGSUSPEND
+# define __ARCH_WANT_SYS_SGETMASK
+# define __ARCH_WANT_SYS_SIGNAL
+# define __ARCH_WANT_SYS_SIGPENDING
+# define __ARCH_WANT_SYS_SIGPROCMASK
+# define __ARCH_WANT_SYS_SOCKETCALL
+# define __ARCH_WANT_SYS_TIME
+# define __ARCH_WANT_SYS_UTIME
+# define __ARCH_WANT_SYS_WAITPID
+
+/*
+ * "Conditional" syscalls
+ *
+ * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
+ * but it doesn't work on all toolchains, so we just do it by hand
+ */
+# define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
+
 #else
 # ifdef __i386__
-#  include "unistd_32.h"
+#  include <asm/unistd_32.h>
 # else
-#  include "unistd_64.h"
+#  include <asm/unistd_64.h>
 # endif
 #endif
+
+#endif /* _ASM_X86_UNISTD_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
deleted file mode 100644
index 599c77d38f33..000000000000
--- a/arch/x86/include/asm/unistd_32.h
+++ /dev/null
@@ -1,401 +0,0 @@
-#ifndef _ASM_X86_UNISTD_32_H
-#define _ASM_X86_UNISTD_32_H
-
-/*
- * This file contains the system call numbers.
- */
-
-#define __NR_restart_syscall      0
-#define __NR_exit		  1
-#define __NR_fork		  2
-#define __NR_read		  3
-#define __NR_write		  4
-#define __NR_open		  5
-#define __NR_close		  6
-#define __NR_waitpid		  7
-#define __NR_creat		  8
-#define __NR_link		  9
-#define __NR_unlink		 10
-#define __NR_execve		 11
-#define __NR_chdir		 12
-#define __NR_time		 13
-#define __NR_mknod		 14
-#define __NR_chmod		 15
-#define __NR_lchown		 16
-#define __NR_break		 17
-#define __NR_oldstat		 18
-#define __NR_lseek		 19
-#define __NR_getpid		 20
-#define __NR_mount		 21
-#define __NR_umount		 22
-#define __NR_setuid		 23
-#define __NR_getuid		 24
-#define __NR_stime		 25
-#define __NR_ptrace		 26
-#define __NR_alarm		 27
-#define __NR_oldfstat		 28
-#define __NR_pause		 29
-#define __NR_utime		 30
-#define __NR_stty		 31
-#define __NR_gtty		 32
-#define __NR_access		 33
-#define __NR_nice		 34
-#define __NR_ftime		 35
-#define __NR_sync		 36
-#define __NR_kill		 37
-#define __NR_rename		 38
-#define __NR_mkdir		 39
-#define __NR_rmdir		 40
-#define __NR_dup		 41
-#define __NR_pipe		 42
-#define __NR_times		 43
-#define __NR_prof		 44
-#define __NR_brk		 45
-#define __NR_setgid		 46
-#define __NR_getgid		 47
-#define __NR_signal		 48
-#define __NR_geteuid		 49
-#define __NR_getegid		 50
-#define __NR_acct		 51
-#define __NR_umount2		 52
-#define __NR_lock		 53
-#define __NR_ioctl		 54
-#define __NR_fcntl		 55
-#define __NR_mpx		 56
-#define __NR_setpgid		 57
-#define __NR_ulimit		 58
-#define __NR_oldolduname	 59
-#define __NR_umask		 60
-#define __NR_chroot		 61
-#define __NR_ustat		 62
-#define __NR_dup2		 63
-#define __NR_getppid		 64
-#define __NR_getpgrp		 65
-#define __NR_setsid		 66
-#define __NR_sigaction		 67
-#define __NR_sgetmask		 68
-#define __NR_ssetmask		 69
-#define __NR_setreuid		 70
-#define __NR_setregid		 71
-#define __NR_sigsuspend		 72
-#define __NR_sigpending		 73
-#define __NR_sethostname	 74
-#define __NR_setrlimit		 75
-#define __NR_getrlimit		 76   /* Back compatible 2Gig limited rlimit */
-#define __NR_getrusage		 77
-#define __NR_gettimeofday	 78
-#define __NR_settimeofday	 79
-#define __NR_getgroups		 80
-#define __NR_setgroups		 81
-#define __NR_select		 82
-#define __NR_symlink		 83
-#define __NR_oldlstat		 84
-#define __NR_readlink		 85
-#define __NR_uselib		 86
-#define __NR_swapon		 87
-#define __NR_reboot		 88
-#define __NR_readdir		 89
-#define __NR_mmap		 90
-#define __NR_munmap		 91
-#define __NR_truncate		 92
-#define __NR_ftruncate		 93
-#define __NR_fchmod		 94
-#define __NR_fchown		 95
-#define __NR_getpriority	 96
-#define __NR_setpriority	 97
-#define __NR_profil		 98
-#define __NR_statfs		 99
-#define __NR_fstatfs		100
-#define __NR_ioperm		101
-#define __NR_socketcall		102
-#define __NR_syslog		103
-#define __NR_setitimer		104
-#define __NR_getitimer		105
-#define __NR_stat		106
-#define __NR_lstat		107
-#define __NR_fstat		108
-#define __NR_olduname		109
-#define __NR_iopl		110
-#define __NR_vhangup		111
-#define __NR_idle		112
-#define __NR_vm86old		113
-#define __NR_wait4		114
-#define __NR_swapoff		115
-#define __NR_sysinfo		116
-#define __NR_ipc		117
-#define __NR_fsync		118
-#define __NR_sigreturn		119
-#define __NR_clone		120
-#define __NR_setdomainname	121
-#define __NR_uname		122
-#define __NR_modify_ldt		123
-#define __NR_adjtimex		124
-#define __NR_mprotect		125
-#define __NR_sigprocmask	126
-#define __NR_create_module	127
-#define __NR_init_module	128
-#define __NR_delete_module	129
-#define __NR_get_kernel_syms	130
-#define __NR_quotactl		131
-#define __NR_getpgid		132
-#define __NR_fchdir		133
-#define __NR_bdflush		134
-#define __NR_sysfs		135
-#define __NR_personality	136
-#define __NR_afs_syscall	137 /* Syscall for Andrew File System */
-#define __NR_setfsuid		138
-#define __NR_setfsgid		139
-#define __NR__llseek		140
-#define __NR_getdents		141
-#define __NR__newselect		142
-#define __NR_flock		143
-#define __NR_msync		144
-#define __NR_readv		145
-#define __NR_writev		146
-#define __NR_getsid		147
-#define __NR_fdatasync		148
-#define __NR__sysctl		149
-#define __NR_mlock		150
-#define __NR_munlock		151
-#define __NR_mlockall		152
-#define __NR_munlockall		153
-#define __NR_sched_setparam		154
-#define __NR_sched_getparam		155
-#define __NR_sched_setscheduler		156
-#define __NR_sched_getscheduler		157
-#define __NR_sched_yield		158
-#define __NR_sched_get_priority_max	159
-#define __NR_sched_get_priority_min	160
-#define __NR_sched_rr_get_interval	161
-#define __NR_nanosleep		162
-#define __NR_mremap		163
-#define __NR_setresuid		164
-#define __NR_getresuid		165
-#define __NR_vm86		166
-#define __NR_query_module	167
-#define __NR_poll		168
-#define __NR_nfsservctl		169
-#define __NR_setresgid		170
-#define __NR_getresgid		171
-#define __NR_prctl              172
-#define __NR_rt_sigreturn	173
-#define __NR_rt_sigaction	174
-#define __NR_rt_sigprocmask	175
-#define __NR_rt_sigpending	176
-#define __NR_rt_sigtimedwait	177
-#define __NR_rt_sigqueueinfo	178
-#define __NR_rt_sigsuspend	179
-#define __NR_pread64		180
-#define __NR_pwrite64		181
-#define __NR_chown		182
-#define __NR_getcwd		183
-#define __NR_capget		184
-#define __NR_capset		185
-#define __NR_sigaltstack	186
-#define __NR_sendfile		187
-#define __NR_getpmsg		188	/* some people actually want streams */
-#define __NR_putpmsg		189	/* some people actually want streams */
-#define __NR_vfork		190
-#define __NR_ugetrlimit		191	/* SuS compliant getrlimit */
-#define __NR_mmap2		192
-#define __NR_truncate64		193
-#define __NR_ftruncate64	194
-#define __NR_stat64		195
-#define __NR_lstat64		196
-#define __NR_fstat64		197
-#define __NR_lchown32		198
-#define __NR_getuid32		199
-#define __NR_getgid32		200
-#define __NR_geteuid32		201
-#define __NR_getegid32		202
-#define __NR_setreuid32		203
-#define __NR_setregid32		204
-#define __NR_getgroups32	205
-#define __NR_setgroups32	206
-#define __NR_fchown32		207
-#define __NR_setresuid32	208
-#define __NR_getresuid32	209
-#define __NR_setresgid32	210
-#define __NR_getresgid32	211
-#define __NR_chown32		212
-#define __NR_setuid32		213
-#define __NR_setgid32		214
-#define __NR_setfsuid32		215
-#define __NR_setfsgid32		216
-#define __NR_pivot_root		217
-#define __NR_mincore		218
-#define __NR_madvise		219
-#define __NR_madvise1		219	/* delete when C lib stub is removed */
-#define __NR_getdents64		220
-#define __NR_fcntl64		221
-/* 223 is unused */
-#define __NR_gettid		224
-#define __NR_readahead		225
-#define __NR_setxattr		226
-#define __NR_lsetxattr		227
-#define __NR_fsetxattr		228
-#define __NR_getxattr		229
-#define __NR_lgetxattr		230
-#define __NR_fgetxattr		231
-#define __NR_listxattr		232
-#define __NR_llistxattr		233
-#define __NR_flistxattr		234
-#define __NR_removexattr	235
-#define __NR_lremovexattr	236
-#define __NR_fremovexattr	237
-#define __NR_tkill		238
-#define __NR_sendfile64		239
-#define __NR_futex		240
-#define __NR_sched_setaffinity	241
-#define __NR_sched_getaffinity	242
-#define __NR_set_thread_area	243
-#define __NR_get_thread_area	244
-#define __NR_io_setup		245
-#define __NR_io_destroy		246
-#define __NR_io_getevents	247
-#define __NR_io_submit		248
-#define __NR_io_cancel		249
-#define __NR_fadvise64		250
-/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */
-#define __NR_exit_group		252
-#define __NR_lookup_dcookie	253
-#define __NR_epoll_create	254
-#define __NR_epoll_ctl		255
-#define __NR_epoll_wait		256
-#define __NR_remap_file_pages	257
-#define __NR_set_tid_address	258
-#define __NR_timer_create	259
-#define __NR_timer_settime	(__NR_timer_create+1)
-#define __NR_timer_gettime	(__NR_timer_create+2)
-#define __NR_timer_getoverrun	(__NR_timer_create+3)
-#define __NR_timer_delete	(__NR_timer_create+4)
-#define __NR_clock_settime	(__NR_timer_create+5)
-#define __NR_clock_gettime	(__NR_timer_create+6)
-#define __NR_clock_getres	(__NR_timer_create+7)
-#define __NR_clock_nanosleep	(__NR_timer_create+8)
-#define __NR_statfs64		268
-#define __NR_fstatfs64		269
-#define __NR_tgkill		270
-#define __NR_utimes		271
-#define __NR_fadvise64_64	272
-#define __NR_vserver		273
-#define __NR_mbind		274
-#define __NR_get_mempolicy	275
-#define __NR_set_mempolicy	276
-#define __NR_mq_open 		277
-#define __NR_mq_unlink		(__NR_mq_open+1)
-#define __NR_mq_timedsend	(__NR_mq_open+2)
-#define __NR_mq_timedreceive	(__NR_mq_open+3)
-#define __NR_mq_notify		(__NR_mq_open+4)
-#define __NR_mq_getsetattr	(__NR_mq_open+5)
-#define __NR_kexec_load		283
-#define __NR_waitid		284
-/* #define __NR_sys_setaltroot	285 */
-#define __NR_add_key		286
-#define __NR_request_key	287
-#define __NR_keyctl		288
-#define __NR_ioprio_set		289
-#define __NR_ioprio_get		290
-#define __NR_inotify_init	291
-#define __NR_inotify_add_watch	292
-#define __NR_inotify_rm_watch	293
-#define __NR_migrate_pages	294
-#define __NR_openat		295
-#define __NR_mkdirat		296
-#define __NR_mknodat		297
-#define __NR_fchownat		298
-#define __NR_futimesat		299
-#define __NR_fstatat64		300
-#define __NR_unlinkat		301
-#define __NR_renameat		302
-#define __NR_linkat		303
-#define __NR_symlinkat		304
-#define __NR_readlinkat		305
-#define __NR_fchmodat		306
-#define __NR_faccessat		307
-#define __NR_pselect6		308
-#define __NR_ppoll		309
-#define __NR_unshare		310
-#define __NR_set_robust_list	311
-#define __NR_get_robust_list	312
-#define __NR_splice		313
-#define __NR_sync_file_range	314
-#define __NR_tee		315
-#define __NR_vmsplice		316
-#define __NR_move_pages		317
-#define __NR_getcpu		318
-#define __NR_epoll_pwait	319
-#define __NR_utimensat		320
-#define __NR_signalfd		321
-#define __NR_timerfd_create	322
-#define __NR_eventfd		323
-#define __NR_fallocate		324
-#define __NR_timerfd_settime	325
-#define __NR_timerfd_gettime	326
-#define __NR_signalfd4		327
-#define __NR_eventfd2		328
-#define __NR_epoll_create1	329
-#define __NR_dup3		330
-#define __NR_pipe2		331
-#define __NR_inotify_init1	332
-#define __NR_preadv		333
-#define __NR_pwritev		334
-#define __NR_rt_tgsigqueueinfo	335
-#define __NR_perf_event_open	336
-#define __NR_recvmmsg		337
-#define __NR_fanotify_init	338
-#define __NR_fanotify_mark	339
-#define __NR_prlimit64		340
-#define __NR_name_to_handle_at	341
-#define __NR_open_by_handle_at  342
-#define __NR_clock_adjtime	343
-#define __NR_syncfs             344
-#define __NR_sendmmsg		345
-#define __NR_setns		346
-#define __NR_process_vm_readv	347
-#define __NR_process_vm_writev	348
-
-#ifdef __KERNEL__
-
-#define NR_syscalls 349
-
-#define __ARCH_WANT_IPC_PARSE_VERSION
-#define __ARCH_WANT_OLD_READDIR
-#define __ARCH_WANT_OLD_STAT
-#define __ARCH_WANT_STAT64
-#define __ARCH_WANT_SYS_ALARM
-#define __ARCH_WANT_SYS_GETHOSTNAME
-#define __ARCH_WANT_SYS_IPC
-#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
-#define __ARCH_WANT_SYS_SIGNAL
-#define __ARCH_WANT_SYS_TIME
-#define __ARCH_WANT_SYS_UTIME
-#define __ARCH_WANT_SYS_WAITPID
-#define __ARCH_WANT_SYS_SOCKETCALL
-#define __ARCH_WANT_SYS_FADVISE64
-#define __ARCH_WANT_SYS_GETPGRP
-#define __ARCH_WANT_SYS_LLSEEK
-#define __ARCH_WANT_SYS_NICE
-#define __ARCH_WANT_SYS_OLD_GETRLIMIT
-#define __ARCH_WANT_SYS_OLD_UNAME
-#define __ARCH_WANT_SYS_OLD_MMAP
-#define __ARCH_WANT_SYS_OLD_SELECT
-#define __ARCH_WANT_SYS_OLDUMOUNT
-#define __ARCH_WANT_SYS_SIGPENDING
-#define __ARCH_WANT_SYS_SIGPROCMASK
-#define __ARCH_WANT_SYS_RT_SIGACTION
-#define __ARCH_WANT_SYS_RT_SIGSUSPEND
-
-/*
- * "Conditional" syscalls
- *
- * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
- * but it doesn't work on all toolchains, so we just do it by hand
- */
-#ifndef cond_syscall
-#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
-#endif
-
-#endif /* __KERNEL__ */
-#endif /* _ASM_X86_UNISTD_32_H */
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
deleted file mode 100644
index 0431f193c3f2..000000000000
--- a/arch/x86/include/asm/unistd_64.h
+++ /dev/null
@@ -1,732 +0,0 @@
-#ifndef _ASM_X86_UNISTD_64_H
-#define _ASM_X86_UNISTD_64_H
-
-#ifndef __SYSCALL
-#define __SYSCALL(a, b)
-#endif
-
-/*
- * This file contains the system call numbers.
- *
- * Note: holes are not allowed.
- */
-
-/* at least 8 syscall per cacheline */
-#define __NR_read				0
-__SYSCALL(__NR_read, sys_read)
-#define __NR_write				1
-__SYSCALL(__NR_write, sys_write)
-#define __NR_open				2
-__SYSCALL(__NR_open, sys_open)
-#define __NR_close				3
-__SYSCALL(__NR_close, sys_close)
-#define __NR_stat				4
-__SYSCALL(__NR_stat, sys_newstat)
-#define __NR_fstat				5
-__SYSCALL(__NR_fstat, sys_newfstat)
-#define __NR_lstat				6
-__SYSCALL(__NR_lstat, sys_newlstat)
-#define __NR_poll				7
-__SYSCALL(__NR_poll, sys_poll)
-
-#define __NR_lseek				8
-__SYSCALL(__NR_lseek, sys_lseek)
-#define __NR_mmap				9
-__SYSCALL(__NR_mmap, sys_mmap)
-#define __NR_mprotect				10
-__SYSCALL(__NR_mprotect, sys_mprotect)
-#define __NR_munmap				11
-__SYSCALL(__NR_munmap, sys_munmap)
-#define __NR_brk				12
-__SYSCALL(__NR_brk, sys_brk)
-#define __NR_rt_sigaction			13
-__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction)
-#define __NR_rt_sigprocmask			14
-__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask)
-#define __NR_rt_sigreturn			15
-__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn)
-
-#define __NR_ioctl				16
-__SYSCALL(__NR_ioctl, sys_ioctl)
-#define __NR_pread64				17
-__SYSCALL(__NR_pread64, sys_pread64)
-#define __NR_pwrite64				18
-__SYSCALL(__NR_pwrite64, sys_pwrite64)
-#define __NR_readv				19
-__SYSCALL(__NR_readv, sys_readv)
-#define __NR_writev				20
-__SYSCALL(__NR_writev, sys_writev)
-#define __NR_access				21
-__SYSCALL(__NR_access, sys_access)
-#define __NR_pipe				22
-__SYSCALL(__NR_pipe, sys_pipe)
-#define __NR_select				23
-__SYSCALL(__NR_select, sys_select)
-
-#define __NR_sched_yield			24
-__SYSCALL(__NR_sched_yield, sys_sched_yield)
-#define __NR_mremap				25
-__SYSCALL(__NR_mremap, sys_mremap)
-#define __NR_msync				26
-__SYSCALL(__NR_msync, sys_msync)
-#define __NR_mincore				27
-__SYSCALL(__NR_mincore, sys_mincore)
-#define __NR_madvise				28
-__SYSCALL(__NR_madvise, sys_madvise)
-#define __NR_shmget				29
-__SYSCALL(__NR_shmget, sys_shmget)
-#define __NR_shmat				30
-__SYSCALL(__NR_shmat, sys_shmat)
-#define __NR_shmctl				31
-__SYSCALL(__NR_shmctl, sys_shmctl)
-
-#define __NR_dup				32
-__SYSCALL(__NR_dup, sys_dup)
-#define __NR_dup2				33
-__SYSCALL(__NR_dup2, sys_dup2)
-#define __NR_pause				34
-__SYSCALL(__NR_pause, sys_pause)
-#define __NR_nanosleep				35
-__SYSCALL(__NR_nanosleep, sys_nanosleep)
-#define __NR_getitimer				36
-__SYSCALL(__NR_getitimer, sys_getitimer)
-#define __NR_alarm				37
-__SYSCALL(__NR_alarm, sys_alarm)
-#define __NR_setitimer				38
-__SYSCALL(__NR_setitimer, sys_setitimer)
-#define __NR_getpid				39
-__SYSCALL(__NR_getpid, sys_getpid)
-
-#define __NR_sendfile				40
-__SYSCALL(__NR_sendfile, sys_sendfile64)
-#define __NR_socket				41
-__SYSCALL(__NR_socket, sys_socket)
-#define __NR_connect				42
-__SYSCALL(__NR_connect, sys_connect)
-#define __NR_accept				43
-__SYSCALL(__NR_accept, sys_accept)
-#define __NR_sendto				44
-__SYSCALL(__NR_sendto, sys_sendto)
-#define __NR_recvfrom				45
-__SYSCALL(__NR_recvfrom, sys_recvfrom)
-#define __NR_sendmsg				46
-__SYSCALL(__NR_sendmsg, sys_sendmsg)
-#define __NR_recvmsg				47
-__SYSCALL(__NR_recvmsg, sys_recvmsg)
-
-#define __NR_shutdown				48
-__SYSCALL(__NR_shutdown, sys_shutdown)
-#define __NR_bind				49
-__SYSCALL(__NR_bind, sys_bind)
-#define __NR_listen				50
-__SYSCALL(__NR_listen, sys_listen)
-#define __NR_getsockname			51
-__SYSCALL(__NR_getsockname, sys_getsockname)
-#define __NR_getpeername			52
-__SYSCALL(__NR_getpeername, sys_getpeername)
-#define __NR_socketpair				53
-__SYSCALL(__NR_socketpair, sys_socketpair)
-#define __NR_setsockopt				54
-__SYSCALL(__NR_setsockopt, sys_setsockopt)
-#define __NR_getsockopt				55
-__SYSCALL(__NR_getsockopt, sys_getsockopt)
-
-#define __NR_clone				56
-__SYSCALL(__NR_clone, stub_clone)
-#define __NR_fork				57
-__SYSCALL(__NR_fork, stub_fork)
-#define __NR_vfork				58
-__SYSCALL(__NR_vfork, stub_vfork)
-#define __NR_execve				59
-__SYSCALL(__NR_execve, stub_execve)
-#define __NR_exit				60
-__SYSCALL(__NR_exit, sys_exit)
-#define __NR_wait4				61
-__SYSCALL(__NR_wait4, sys_wait4)
-#define __NR_kill				62
-__SYSCALL(__NR_kill, sys_kill)
-#define __NR_uname				63
-__SYSCALL(__NR_uname, sys_newuname)
-
-#define __NR_semget				64
-__SYSCALL(__NR_semget, sys_semget)
-#define __NR_semop				65
-__SYSCALL(__NR_semop, sys_semop)
-#define __NR_semctl				66
-__SYSCALL(__NR_semctl, sys_semctl)
-#define __NR_shmdt				67
-__SYSCALL(__NR_shmdt, sys_shmdt)
-#define __NR_msgget				68
-__SYSCALL(__NR_msgget, sys_msgget)
-#define __NR_msgsnd				69
-__SYSCALL(__NR_msgsnd, sys_msgsnd)
-#define __NR_msgrcv				70
-__SYSCALL(__NR_msgrcv, sys_msgrcv)
-#define __NR_msgctl				71
-__SYSCALL(__NR_msgctl, sys_msgctl)
-
-#define __NR_fcntl				72
-__SYSCALL(__NR_fcntl, sys_fcntl)
-#define __NR_flock				73
-__SYSCALL(__NR_flock, sys_flock)
-#define __NR_fsync				74
-__SYSCALL(__NR_fsync, sys_fsync)
-#define __NR_fdatasync				75
-__SYSCALL(__NR_fdatasync, sys_fdatasync)
-#define __NR_truncate				76
-__SYSCALL(__NR_truncate, sys_truncate)
-#define __NR_ftruncate				77
-__SYSCALL(__NR_ftruncate, sys_ftruncate)
-#define __NR_getdents				78
-__SYSCALL(__NR_getdents, sys_getdents)
-#define __NR_getcwd				79
-__SYSCALL(__NR_getcwd, sys_getcwd)
-
-#define __NR_chdir				80
-__SYSCALL(__NR_chdir, sys_chdir)
-#define __NR_fchdir				81
-__SYSCALL(__NR_fchdir, sys_fchdir)
-#define __NR_rename				82
-__SYSCALL(__NR_rename, sys_rename)
-#define __NR_mkdir				83
-__SYSCALL(__NR_mkdir, sys_mkdir)
-#define __NR_rmdir				84
-__SYSCALL(__NR_rmdir, sys_rmdir)
-#define __NR_creat				85
-__SYSCALL(__NR_creat, sys_creat)
-#define __NR_link				86
-__SYSCALL(__NR_link, sys_link)
-#define __NR_unlink				87
-__SYSCALL(__NR_unlink, sys_unlink)
-
-#define __NR_symlink				88
-__SYSCALL(__NR_symlink, sys_symlink)
-#define __NR_readlink				89
-__SYSCALL(__NR_readlink, sys_readlink)
-#define __NR_chmod				90
-__SYSCALL(__NR_chmod, sys_chmod)
-#define __NR_fchmod				91
-__SYSCALL(__NR_fchmod, sys_fchmod)
-#define __NR_chown				92
-__SYSCALL(__NR_chown, sys_chown)
-#define __NR_fchown				93
-__SYSCALL(__NR_fchown, sys_fchown)
-#define __NR_lchown				94
-__SYSCALL(__NR_lchown, sys_lchown)
-#define __NR_umask				95
-__SYSCALL(__NR_umask, sys_umask)
-
-#define __NR_gettimeofday			96
-__SYSCALL(__NR_gettimeofday, sys_gettimeofday)
-#define __NR_getrlimit				97
-__SYSCALL(__NR_getrlimit, sys_getrlimit)
-#define __NR_getrusage				98
-__SYSCALL(__NR_getrusage, sys_getrusage)
-#define __NR_sysinfo				99
-__SYSCALL(__NR_sysinfo, sys_sysinfo)
-#define __NR_times				100
-__SYSCALL(__NR_times, sys_times)
-#define __NR_ptrace				101
-__SYSCALL(__NR_ptrace, sys_ptrace)
-#define __NR_getuid				102
-__SYSCALL(__NR_getuid, sys_getuid)
-#define __NR_syslog				103
-__SYSCALL(__NR_syslog, sys_syslog)
-
-/* at the very end the stuff that never runs during the benchmarks */
-#define __NR_getgid				104
-__SYSCALL(__NR_getgid, sys_getgid)
-#define __NR_setuid				105
-__SYSCALL(__NR_setuid, sys_setuid)
-#define __NR_setgid				106
-__SYSCALL(__NR_setgid, sys_setgid)
-#define __NR_geteuid				107
-__SYSCALL(__NR_geteuid, sys_geteuid)
-#define __NR_getegid				108
-__SYSCALL(__NR_getegid, sys_getegid)
-#define __NR_setpgid				109
-__SYSCALL(__NR_setpgid, sys_setpgid)
-#define __NR_getppid				110
-__SYSCALL(__NR_getppid, sys_getppid)
-#define __NR_getpgrp				111
-__SYSCALL(__NR_getpgrp, sys_getpgrp)
-
-#define __NR_setsid				112
-__SYSCALL(__NR_setsid, sys_setsid)
-#define __NR_setreuid				113
-__SYSCALL(__NR_setreuid, sys_setreuid)
-#define __NR_setregid				114
-__SYSCALL(__NR_setregid, sys_setregid)
-#define __NR_getgroups				115
-__SYSCALL(__NR_getgroups, sys_getgroups)
-#define __NR_setgroups				116
-__SYSCALL(__NR_setgroups, sys_setgroups)
-#define __NR_setresuid				117
-__SYSCALL(__NR_setresuid, sys_setresuid)
-#define __NR_getresuid				118
-__SYSCALL(__NR_getresuid, sys_getresuid)
-#define __NR_setresgid				119
-__SYSCALL(__NR_setresgid, sys_setresgid)
-
-#define __NR_getresgid				120
-__SYSCALL(__NR_getresgid, sys_getresgid)
-#define __NR_getpgid				121
-__SYSCALL(__NR_getpgid, sys_getpgid)
-#define __NR_setfsuid				122
-__SYSCALL(__NR_setfsuid, sys_setfsuid)
-#define __NR_setfsgid				123
-__SYSCALL(__NR_setfsgid, sys_setfsgid)
-#define __NR_getsid				124
-__SYSCALL(__NR_getsid, sys_getsid)
-#define __NR_capget				125
-__SYSCALL(__NR_capget, sys_capget)
-#define __NR_capset				126
-__SYSCALL(__NR_capset, sys_capset)
-
-#define __NR_rt_sigpending			127
-__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending)
-#define __NR_rt_sigtimedwait			128
-__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait)
-#define __NR_rt_sigqueueinfo			129
-__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo)
-#define __NR_rt_sigsuspend			130
-__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend)
-#define __NR_sigaltstack			131
-__SYSCALL(__NR_sigaltstack, stub_sigaltstack)
-#define __NR_utime				132
-__SYSCALL(__NR_utime, sys_utime)
-#define __NR_mknod				133
-__SYSCALL(__NR_mknod, sys_mknod)
-
-/* Only needed for a.out */
-#define __NR_uselib				134
-__SYSCALL(__NR_uselib, sys_ni_syscall)
-#define __NR_personality			135
-__SYSCALL(__NR_personality, sys_personality)
-
-#define __NR_ustat				136
-__SYSCALL(__NR_ustat, sys_ustat)
-#define __NR_statfs				137
-__SYSCALL(__NR_statfs, sys_statfs)
-#define __NR_fstatfs				138
-__SYSCALL(__NR_fstatfs, sys_fstatfs)
-#define __NR_sysfs				139
-__SYSCALL(__NR_sysfs, sys_sysfs)
-
-#define __NR_getpriority			140
-__SYSCALL(__NR_getpriority, sys_getpriority)
-#define __NR_setpriority			141
-__SYSCALL(__NR_setpriority, sys_setpriority)
-#define __NR_sched_setparam			142
-__SYSCALL(__NR_sched_setparam, sys_sched_setparam)
-#define __NR_sched_getparam			143
-__SYSCALL(__NR_sched_getparam, sys_sched_getparam)
-#define __NR_sched_setscheduler			144
-__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler)
-#define __NR_sched_getscheduler			145
-__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler)
-#define __NR_sched_get_priority_max		146
-__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
-#define __NR_sched_get_priority_min		147
-__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
-#define __NR_sched_rr_get_interval		148
-__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval)
-
-#define __NR_mlock				149
-__SYSCALL(__NR_mlock, sys_mlock)
-#define __NR_munlock				150
-__SYSCALL(__NR_munlock, sys_munlock)
-#define __NR_mlockall				151
-__SYSCALL(__NR_mlockall, sys_mlockall)
-#define __NR_munlockall				152
-__SYSCALL(__NR_munlockall, sys_munlockall)
-
-#define __NR_vhangup				153
-__SYSCALL(__NR_vhangup, sys_vhangup)
-
-#define __NR_modify_ldt				154
-__SYSCALL(__NR_modify_ldt, sys_modify_ldt)
-
-#define __NR_pivot_root				155
-__SYSCALL(__NR_pivot_root, sys_pivot_root)
-
-#define __NR__sysctl				156
-__SYSCALL(__NR__sysctl, sys_sysctl)
-
-#define __NR_prctl				157
-__SYSCALL(__NR_prctl, sys_prctl)
-#define __NR_arch_prctl				158
-__SYSCALL(__NR_arch_prctl, sys_arch_prctl)
-
-#define __NR_adjtimex				159
-__SYSCALL(__NR_adjtimex, sys_adjtimex)
-
-#define __NR_setrlimit				160
-__SYSCALL(__NR_setrlimit, sys_setrlimit)
-
-#define __NR_chroot				161
-__SYSCALL(__NR_chroot, sys_chroot)
-
-#define __NR_sync				162
-__SYSCALL(__NR_sync, sys_sync)
-
-#define __NR_acct				163
-__SYSCALL(__NR_acct, sys_acct)
-
-#define __NR_settimeofday			164
-__SYSCALL(__NR_settimeofday, sys_settimeofday)
-
-#define __NR_mount				165
-__SYSCALL(__NR_mount, sys_mount)
-#define __NR_umount2				166
-__SYSCALL(__NR_umount2, sys_umount)
-
-#define __NR_swapon				167
-__SYSCALL(__NR_swapon, sys_swapon)
-#define __NR_swapoff				168
-__SYSCALL(__NR_swapoff, sys_swapoff)
-
-#define __NR_reboot				169
-__SYSCALL(__NR_reboot, sys_reboot)
-
-#define __NR_sethostname			170
-__SYSCALL(__NR_sethostname, sys_sethostname)
-#define __NR_setdomainname			171
-__SYSCALL(__NR_setdomainname, sys_setdomainname)
-
-#define __NR_iopl				172
-__SYSCALL(__NR_iopl, stub_iopl)
-#define __NR_ioperm				173
-__SYSCALL(__NR_ioperm, sys_ioperm)
-
-#define __NR_create_module			174
-__SYSCALL(__NR_create_module, sys_ni_syscall)
-#define __NR_init_module			175
-__SYSCALL(__NR_init_module, sys_init_module)
-#define __NR_delete_module			176
-__SYSCALL(__NR_delete_module, sys_delete_module)
-#define __NR_get_kernel_syms			177
-__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall)
-#define __NR_query_module			178
-__SYSCALL(__NR_query_module, sys_ni_syscall)
-
-#define __NR_quotactl				179
-__SYSCALL(__NR_quotactl, sys_quotactl)
-
-#define __NR_nfsservctl				180
-__SYSCALL(__NR_nfsservctl, sys_ni_syscall)
-
-/* reserved for LiS/STREAMS */
-#define __NR_getpmsg				181
-__SYSCALL(__NR_getpmsg, sys_ni_syscall)
-#define __NR_putpmsg				182
-__SYSCALL(__NR_putpmsg, sys_ni_syscall)
-
-/* reserved for AFS */
-#define __NR_afs_syscall			183
-__SYSCALL(__NR_afs_syscall, sys_ni_syscall)
-
-/* reserved for tux */
-#define __NR_tuxcall				184
-__SYSCALL(__NR_tuxcall, sys_ni_syscall)
-
-#define __NR_security				185
-__SYSCALL(__NR_security, sys_ni_syscall)
-
-#define __NR_gettid				186
-__SYSCALL(__NR_gettid, sys_gettid)
-
-#define __NR_readahead				187
-__SYSCALL(__NR_readahead, sys_readahead)
-#define __NR_setxattr				188
-__SYSCALL(__NR_setxattr, sys_setxattr)
-#define __NR_lsetxattr				189
-__SYSCALL(__NR_lsetxattr, sys_lsetxattr)
-#define __NR_fsetxattr				190
-__SYSCALL(__NR_fsetxattr, sys_fsetxattr)
-#define __NR_getxattr				191
-__SYSCALL(__NR_getxattr, sys_getxattr)
-#define __NR_lgetxattr				192
-__SYSCALL(__NR_lgetxattr, sys_lgetxattr)
-#define __NR_fgetxattr				193
-__SYSCALL(__NR_fgetxattr, sys_fgetxattr)
-#define __NR_listxattr				194
-__SYSCALL(__NR_listxattr, sys_listxattr)
-#define __NR_llistxattr				195
-__SYSCALL(__NR_llistxattr, sys_llistxattr)
-#define __NR_flistxattr				196
-__SYSCALL(__NR_flistxattr, sys_flistxattr)
-#define __NR_removexattr			197
-__SYSCALL(__NR_removexattr, sys_removexattr)
-#define __NR_lremovexattr			198
-__SYSCALL(__NR_lremovexattr, sys_lremovexattr)
-#define __NR_fremovexattr			199
-__SYSCALL(__NR_fremovexattr, sys_fremovexattr)
-#define __NR_tkill				200
-__SYSCALL(__NR_tkill, sys_tkill)
-#define __NR_time				201
-__SYSCALL(__NR_time, sys_time)
-#define __NR_futex				202
-__SYSCALL(__NR_futex, sys_futex)
-#define __NR_sched_setaffinity			203
-__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity)
-#define __NR_sched_getaffinity			204
-__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity)
-#define __NR_set_thread_area			205
-__SYSCALL(__NR_set_thread_area, sys_ni_syscall)	/* use arch_prctl */
-#define __NR_io_setup				206
-__SYSCALL(__NR_io_setup, sys_io_setup)
-#define __NR_io_destroy				207
-__SYSCALL(__NR_io_destroy, sys_io_destroy)
-#define __NR_io_getevents			208
-__SYSCALL(__NR_io_getevents, sys_io_getevents)
-#define __NR_io_submit				209
-__SYSCALL(__NR_io_submit, sys_io_submit)
-#define __NR_io_cancel				210
-__SYSCALL(__NR_io_cancel, sys_io_cancel)
-#define __NR_get_thread_area			211
-__SYSCALL(__NR_get_thread_area, sys_ni_syscall)	/* use arch_prctl */
-#define __NR_lookup_dcookie			212
-__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie)
-#define __NR_epoll_create			213
-__SYSCALL(__NR_epoll_create, sys_epoll_create)
-#define __NR_epoll_ctl_old			214
-__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall)
-#define __NR_epoll_wait_old			215
-__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall)
-#define __NR_remap_file_pages			216
-__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages)
-#define __NR_getdents64				217
-__SYSCALL(__NR_getdents64, sys_getdents64)
-#define __NR_set_tid_address			218
-__SYSCALL(__NR_set_tid_address, sys_set_tid_address)
-#define __NR_restart_syscall			219
-__SYSCALL(__NR_restart_syscall, sys_restart_syscall)
-#define __NR_semtimedop				220
-__SYSCALL(__NR_semtimedop, sys_semtimedop)
-#define __NR_fadvise64				221
-__SYSCALL(__NR_fadvise64, sys_fadvise64)
-#define __NR_timer_create			222
-__SYSCALL(__NR_timer_create, sys_timer_create)
-#define __NR_timer_settime			223
-__SYSCALL(__NR_timer_settime, sys_timer_settime)
-#define __NR_timer_gettime			224
-__SYSCALL(__NR_timer_gettime, sys_timer_gettime)
-#define __NR_timer_getoverrun			225
-__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
-#define __NR_timer_delete			226
-__SYSCALL(__NR_timer_delete, sys_timer_delete)
-#define __NR_clock_settime			227
-__SYSCALL(__NR_clock_settime, sys_clock_settime)
-#define __NR_clock_gettime			228
-__SYSCALL(__NR_clock_gettime, sys_clock_gettime)
-#define __NR_clock_getres			229
-__SYSCALL(__NR_clock_getres, sys_clock_getres)
-#define __NR_clock_nanosleep			230
-__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep)
-#define __NR_exit_group				231
-__SYSCALL(__NR_exit_group, sys_exit_group)
-#define __NR_epoll_wait				232
-__SYSCALL(__NR_epoll_wait, sys_epoll_wait)
-#define __NR_epoll_ctl				233
-__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl)
-#define __NR_tgkill				234
-__SYSCALL(__NR_tgkill, sys_tgkill)
-#define __NR_utimes				235
-__SYSCALL(__NR_utimes, sys_utimes)
-#define __NR_vserver				236
-__SYSCALL(__NR_vserver, sys_ni_syscall)
-#define __NR_mbind				237
-__SYSCALL(__NR_mbind, sys_mbind)
-#define __NR_set_mempolicy			238
-__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy)
-#define __NR_get_mempolicy			239
-__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy)
-#define __NR_mq_open				240
-__SYSCALL(__NR_mq_open, sys_mq_open)
-#define __NR_mq_unlink				241
-__SYSCALL(__NR_mq_unlink, sys_mq_unlink)
-#define __NR_mq_timedsend			242
-__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend)
-#define __NR_mq_timedreceive			243
-__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive)
-#define __NR_mq_notify				244
-__SYSCALL(__NR_mq_notify, sys_mq_notify)
-#define __NR_mq_getsetattr			245
-__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
-#define __NR_kexec_load				246
-__SYSCALL(__NR_kexec_load, sys_kexec_load)
-#define __NR_waitid				247
-__SYSCALL(__NR_waitid, sys_waitid)
-#define __NR_add_key				248
-__SYSCALL(__NR_add_key, sys_add_key)
-#define __NR_request_key			249
-__SYSCALL(__NR_request_key, sys_request_key)
-#define __NR_keyctl				250
-__SYSCALL(__NR_keyctl, sys_keyctl)
-#define __NR_ioprio_set				251
-__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
-#define __NR_ioprio_get				252
-__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
-#define __NR_inotify_init			253
-__SYSCALL(__NR_inotify_init, sys_inotify_init)
-#define __NR_inotify_add_watch			254
-__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
-#define __NR_inotify_rm_watch			255
-__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
-#define __NR_migrate_pages			256
-__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
-#define __NR_openat				257
-__SYSCALL(__NR_openat, sys_openat)
-#define __NR_mkdirat				258
-__SYSCALL(__NR_mkdirat, sys_mkdirat)
-#define __NR_mknodat				259
-__SYSCALL(__NR_mknodat, sys_mknodat)
-#define __NR_fchownat				260
-__SYSCALL(__NR_fchownat, sys_fchownat)
-#define __NR_futimesat				261
-__SYSCALL(__NR_futimesat, sys_futimesat)
-#define __NR_newfstatat				262
-__SYSCALL(__NR_newfstatat, sys_newfstatat)
-#define __NR_unlinkat				263
-__SYSCALL(__NR_unlinkat, sys_unlinkat)
-#define __NR_renameat				264
-__SYSCALL(__NR_renameat, sys_renameat)
-#define __NR_linkat				265
-__SYSCALL(__NR_linkat, sys_linkat)
-#define __NR_symlinkat				266
-__SYSCALL(__NR_symlinkat, sys_symlinkat)
-#define __NR_readlinkat				267
-__SYSCALL(__NR_readlinkat, sys_readlinkat)
-#define __NR_fchmodat				268
-__SYSCALL(__NR_fchmodat, sys_fchmodat)
-#define __NR_faccessat				269
-__SYSCALL(__NR_faccessat, sys_faccessat)
-#define __NR_pselect6				270
-__SYSCALL(__NR_pselect6, sys_pselect6)
-#define __NR_ppoll				271
-__SYSCALL(__NR_ppoll,	sys_ppoll)
-#define __NR_unshare				272
-__SYSCALL(__NR_unshare,	sys_unshare)
-#define __NR_set_robust_list			273
-__SYSCALL(__NR_set_robust_list, sys_set_robust_list)
-#define __NR_get_robust_list			274
-__SYSCALL(__NR_get_robust_list, sys_get_robust_list)
-#define __NR_splice				275
-__SYSCALL(__NR_splice, sys_splice)
-#define __NR_tee				276
-__SYSCALL(__NR_tee, sys_tee)
-#define __NR_sync_file_range			277
-__SYSCALL(__NR_sync_file_range, sys_sync_file_range)
-#define __NR_vmsplice				278
-__SYSCALL(__NR_vmsplice, sys_vmsplice)
-#define __NR_move_pages				279
-__SYSCALL(__NR_move_pages, sys_move_pages)
-#define __NR_utimensat				280
-__SYSCALL(__NR_utimensat, sys_utimensat)
-#define __NR_epoll_pwait			281
-__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait)
-#define __NR_signalfd				282
-__SYSCALL(__NR_signalfd, sys_signalfd)
-#define __NR_timerfd_create			283
-__SYSCALL(__NR_timerfd_create, sys_timerfd_create)
-#define __NR_eventfd				284
-__SYSCALL(__NR_eventfd, sys_eventfd)
-#define __NR_fallocate				285
-__SYSCALL(__NR_fallocate, sys_fallocate)
-#define __NR_timerfd_settime			286
-__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
-#define __NR_timerfd_gettime			287
-__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
-#define __NR_accept4				288
-__SYSCALL(__NR_accept4, sys_accept4)
-#define __NR_signalfd4				289
-__SYSCALL(__NR_signalfd4, sys_signalfd4)
-#define __NR_eventfd2				290
-__SYSCALL(__NR_eventfd2, sys_eventfd2)
-#define __NR_epoll_create1			291
-__SYSCALL(__NR_epoll_create1, sys_epoll_create1)
-#define __NR_dup3				292
-__SYSCALL(__NR_dup3, sys_dup3)
-#define __NR_pipe2				293
-__SYSCALL(__NR_pipe2, sys_pipe2)
-#define __NR_inotify_init1			294
-__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
-#define __NR_preadv				295
-__SYSCALL(__NR_preadv, sys_preadv)
-#define __NR_pwritev				296
-__SYSCALL(__NR_pwritev, sys_pwritev)
-#define __NR_rt_tgsigqueueinfo			297
-__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
-#define __NR_perf_event_open			298
-__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
-#define __NR_recvmmsg				299
-__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
-#define __NR_fanotify_init			300
-__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
-#define __NR_fanotify_mark			301
-__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
-#define __NR_prlimit64				302
-__SYSCALL(__NR_prlimit64, sys_prlimit64)
-#define __NR_name_to_handle_at			303
-__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
-#define __NR_open_by_handle_at			304
-__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
-#define __NR_clock_adjtime			305
-__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
-#define __NR_syncfs                             306
-__SYSCALL(__NR_syncfs, sys_syncfs)
-#define __NR_sendmmsg				307
-__SYSCALL(__NR_sendmmsg, sys_sendmmsg)
-#define __NR_setns				308
-__SYSCALL(__NR_setns, sys_setns)
-#define __NR_getcpu				309
-__SYSCALL(__NR_getcpu, sys_getcpu)
-#define __NR_process_vm_readv			310
-__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
-#define __NR_process_vm_writev			311
-__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
-
-#ifndef __NO_STUBS
-#define __ARCH_WANT_OLD_READDIR
-#define __ARCH_WANT_OLD_STAT
-#define __ARCH_WANT_SYS_ALARM
-#define __ARCH_WANT_SYS_GETHOSTNAME
-#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
-#define __ARCH_WANT_SYS_SIGNAL
-#define __ARCH_WANT_SYS_UTIME
-#define __ARCH_WANT_SYS_WAITPID
-#define __ARCH_WANT_SYS_SOCKETCALL
-#define __ARCH_WANT_SYS_FADVISE64
-#define __ARCH_WANT_SYS_GETPGRP
-#define __ARCH_WANT_SYS_LLSEEK
-#define __ARCH_WANT_SYS_NICE
-#define __ARCH_WANT_SYS_OLD_GETRLIMIT
-#define __ARCH_WANT_SYS_OLD_UNAME
-#define __ARCH_WANT_SYS_OLDUMOUNT
-#define __ARCH_WANT_SYS_SIGPENDING
-#define __ARCH_WANT_SYS_SIGPROCMASK
-#define __ARCH_WANT_SYS_RT_SIGACTION
-#define __ARCH_WANT_SYS_RT_SIGSUSPEND
-#define __ARCH_WANT_SYS_TIME
-#define __ARCH_WANT_COMPAT_SYS_TIME
-#endif	/* __NO_STUBS */
-
-#ifdef __KERNEL__
-
-#ifndef COMPILE_OFFSETS
-#include <asm/asm-offsets.h>
-#define NR_syscalls (__NR_syscall_max + 1)
-#endif
-
-/*
- * "Conditional" syscalls
- *
- * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
- * but it doesn't work on all toolchains, so we just do it by hand
- */
-#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
-#endif	/* __KERNEL__ */
-
-#endif /* _ASM_X86_UNISTD_64_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8baca3c4871c..8c473d9d0b83 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y			+= probe_roms.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
+obj-y			+= syscall_$(BITS).o
+obj-$(CONFIG_X86_64)	+= vsyscall_64.o
 obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 395a10e68067..85d98ab15cdc 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -3,6 +3,11 @@
 #include <linux/lguest.h>
 #include "../../../drivers/lguest/lg.h"
 
+#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_32.h>
+};
+
 /* workaround for a warning with -Wmissing-prototypes */
 void foo(void);
 
@@ -76,4 +81,7 @@ void foo(void)
 	OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
 	OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
 #endif
+	BLANK();
+	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+	DEFINE(NR_syscalls, sizeof(syscalls));
 }
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72a1194af22..834e897b1e25 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,11 +1,12 @@
 #include <asm/ia32.h>
 
-#define __NO_STUBS 1
-#undef __SYSCALL
-#undef _ASM_X86_UNISTD_64_H
-#define __SYSCALL(nr, sym) [nr] = 1,
-static char syscalls[] = {
-#include <asm/unistd.h>
+#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+static char syscalls_64[] = {
+#include <asm/syscalls_64.h>
+};
+#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+static char syscalls_ia32[] = {
+#include <asm/syscalls_32.h>
 };
 
 int main(void)
@@ -72,7 +73,11 @@ int main(void)
 	OFFSET(TSS_ist, tss_struct, x86_tss.ist);
 	BLANK();
 
-	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+	DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
+	DEFINE(NR_syscalls, sizeof(syscalls_64));
+
+	DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1);
+	DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
 
 	return 0;
 }
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f3f6f5344001..1ffcda22c2f6 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -81,8 +81,6 @@
  * enough to patch inline, increasing performance.
  */
 
-#define nr_syscalls ((syscall_table_size)/4)
-
 #ifdef CONFIG_PREEMPT
 #define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
 #else
@@ -423,7 +421,7 @@ sysenter_past_esp:
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz sysenter_audit
 sysenter_do_call:
-	cmpl $(nr_syscalls), %eax
+	cmpl $(NR_syscalls), %eax
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
 	movl %eax,PT_EAX(%esp)
@@ -504,7 +502,7 @@ ENTRY(system_call)
 					# system call tracing in operation / emulation
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz syscall_trace_entry
-	cmpl $(nr_syscalls), %eax
+	cmpl $(NR_syscalls), %eax
 	jae syscall_badsys
 syscall_call:
 	call *sys_call_table(,%eax,4)
@@ -650,7 +648,7 @@ syscall_trace_entry:
 	movl %esp, %eax
 	call syscall_trace_enter
 	/* What it returned is what we'll actually use.  */
-	cmpl $(nr_syscalls), %eax
+	cmpl $(NR_syscalls), %eax
 	jnae syscall_call
 	jmp syscall_exit
 END(syscall_trace_entry)
@@ -690,29 +688,28 @@ END(syscall_badsys)
  * System calls that need a pt_regs pointer.
  */
 #define PTREGSCALL0(name) \
-	ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ;  \
 	leal 4(%esp),%eax; \
-	jmp sys_##name;
+	jmp sys_##name; \
+ENDPROC(ptregs_##name)
 
 #define PTREGSCALL1(name) \
-	ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
 	leal 4(%esp),%edx; \
 	movl (PT_EBX+4)(%esp),%eax; \
-	jmp sys_##name;
+	jmp sys_##name; \
+ENDPROC(ptregs_##name)
 
 #define PTREGSCALL2(name) \
-	ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
 	leal 4(%esp),%ecx; \
 	movl (PT_ECX+4)(%esp),%edx; \
 	movl (PT_EBX+4)(%esp),%eax; \
-	jmp sys_##name;
+	jmp sys_##name; \
+ENDPROC(ptregs_##name)
 
 #define PTREGSCALL3(name) \
-	ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
 	CFI_STARTPROC; \
 	leal 4(%esp),%eax; \
 	pushl_cfi %eax; \
@@ -737,8 +734,7 @@ PTREGSCALL2(vm86)
 PTREGSCALL1(vm86old)
 
 /* Clone is an oddball.  The 4th arg is in %edi */
-	ALIGN;
-ptregs_clone:
+ENTRY(ptregs_clone)
 	CFI_STARTPROC
 	leal 4(%esp),%eax
 	pushl_cfi %eax
@@ -1209,11 +1205,6 @@ return_to_handler:
 	jmp *%ecx
 #endif
 
-.section .rodata,"a"
-#include "syscall_table_32.S"
-
-syscall_table_size=(.-sys_call_table)
-
 /*
  * Some functions should be protected against kprobes
  */
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
new file mode 100644
index 000000000000..b37a57336609
--- /dev/null
+++ b/arch/x86/kernel/syscall_32.c
@@ -0,0 +1,25 @@
+/* System call table for i386. */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <asm/asm-offsets.h>
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_32.h>
+#undef __SYSCALL_I386
+
+#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
+
+typedef asmlinkage void (*sys_call_ptr_t)(void);
+
+extern asmlinkage void sys_ni_syscall(void);
+
+const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+	/*
+	 * Smells like a like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_32.h>
+};
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 0edfafa1b269..7ac7943be02c 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -5,15 +5,11 @@
 #include <linux/cache.h>
 #include <asm/asm-offsets.h>
 
-#define __NO_STUBS
+#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_64.h>
+#undef __SYSCALL_64
 
-#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_UNISTD_64_H
-#include <asm/unistd_64.h>
-
-#undef __SYSCALL
-#define __SYSCALL(nr, sym) [nr] = sym,
-#undef _ASM_X86_UNISTD_64_H
+#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
 
 typedef void (*sys_call_ptr_t)(void);
 
@@ -25,5 +21,5 @@ const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 	 * when the & below is removed.
 	 */
 	[0 ... __NR_syscall_max] = &sys_ni_syscall,
-#include <asm/unistd_64.h>
+#include <asm/syscalls_64.h>
 };
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
deleted file mode 100644
index 9a0e31293920..000000000000
--- a/arch/x86/kernel/syscall_table_32.S
+++ /dev/null
@@ -1,350 +0,0 @@
-ENTRY(sys_call_table)
-	.long sys_restart_syscall	/* 0 - old "setup()" system call, used for restarting */
-	.long sys_exit
-	.long ptregs_fork
-	.long sys_read
-	.long sys_write
-	.long sys_open		/* 5 */
-	.long sys_close
-	.long sys_waitpid
-	.long sys_creat
-	.long sys_link
-	.long sys_unlink	/* 10 */
-	.long ptregs_execve
-	.long sys_chdir
-	.long sys_time
-	.long sys_mknod
-	.long sys_chmod		/* 15 */
-	.long sys_lchown16
-	.long sys_ni_syscall	/* old break syscall holder */
-	.long sys_stat
-	.long sys_lseek
-	.long sys_getpid	/* 20 */
-	.long sys_mount
-	.long sys_oldumount
-	.long sys_setuid16
-	.long sys_getuid16
-	.long sys_stime		/* 25 */
-	.long sys_ptrace
-	.long sys_alarm
-	.long sys_fstat
-	.long sys_pause
-	.long sys_utime		/* 30 */
-	.long sys_ni_syscall	/* old stty syscall holder */
-	.long sys_ni_syscall	/* old gtty syscall holder */
-	.long sys_access
-	.long sys_nice
-	.long sys_ni_syscall	/* 35 - old ftime syscall holder */
-	.long sys_sync
-	.long sys_kill
-	.long sys_rename
-	.long sys_mkdir
-	.long sys_rmdir		/* 40 */
-	.long sys_dup
-	.long sys_pipe
-	.long sys_times
-	.long sys_ni_syscall	/* old prof syscall holder */
-	.long sys_brk		/* 45 */
-	.long sys_setgid16
-	.long sys_getgid16
-	.long sys_signal
-	.long sys_geteuid16
-	.long sys_getegid16	/* 50 */
-	.long sys_acct
-	.long sys_umount	/* recycled never used phys() */
-	.long sys_ni_syscall	/* old lock syscall holder */
-	.long sys_ioctl
-	.long sys_fcntl		/* 55 */
-	.long sys_ni_syscall	/* old mpx syscall holder */
-	.long sys_setpgid
-	.long sys_ni_syscall	/* old ulimit syscall holder */
-	.long sys_olduname
-	.long sys_umask		/* 60 */
-	.long sys_chroot
-	.long sys_ustat
-	.long sys_dup2
-	.long sys_getppid
-	.long sys_getpgrp	/* 65 */
-	.long sys_setsid
-	.long sys_sigaction
-	.long sys_sgetmask
-	.long sys_ssetmask
-	.long sys_setreuid16	/* 70 */
-	.long sys_setregid16
-	.long sys_sigsuspend
-	.long sys_sigpending
-	.long sys_sethostname
-	.long sys_setrlimit	/* 75 */
-	.long sys_old_getrlimit
-	.long sys_getrusage
-	.long sys_gettimeofday
-	.long sys_settimeofday
-	.long sys_getgroups16	/* 80 */
-	.long sys_setgroups16
-	.long sys_old_select
-	.long sys_symlink
-	.long sys_lstat
-	.long sys_readlink	/* 85 */
-	.long sys_uselib
-	.long sys_swapon
-	.long sys_reboot
-	.long sys_old_readdir
-	.long sys_old_mmap	/* 90 */
-	.long sys_munmap
-	.long sys_truncate
-	.long sys_ftruncate
-	.long sys_fchmod
-	.long sys_fchown16	/* 95 */
-	.long sys_getpriority
-	.long sys_setpriority
-	.long sys_ni_syscall	/* old profil syscall holder */
-	.long sys_statfs
-	.long sys_fstatfs	/* 100 */
-	.long sys_ioperm
-	.long sys_socketcall
-	.long sys_syslog
-	.long sys_setitimer
-	.long sys_getitimer	/* 105 */
-	.long sys_newstat
-	.long sys_newlstat
-	.long sys_newfstat
-	.long sys_uname
-	.long ptregs_iopl	/* 110 */
-	.long sys_vhangup
-	.long sys_ni_syscall	/* old "idle" system call */
-	.long ptregs_vm86old
-	.long sys_wait4
-	.long sys_swapoff	/* 115 */
-	.long sys_sysinfo
-	.long sys_ipc
-	.long sys_fsync
-	.long ptregs_sigreturn
-	.long ptregs_clone	/* 120 */
-	.long sys_setdomainname
-	.long sys_newuname
-	.long sys_modify_ldt
-	.long sys_adjtimex
-	.long sys_mprotect	/* 125 */
-	.long sys_sigprocmask
-	.long sys_ni_syscall	/* old "create_module" */
-	.long sys_init_module
-	.long sys_delete_module
-	.long sys_ni_syscall	/* 130:	old "get_kernel_syms" */
-	.long sys_quotactl
-	.long sys_getpgid
-	.long sys_fchdir
-	.long sys_bdflush
-	.long sys_sysfs		/* 135 */
-	.long sys_personality
-	.long sys_ni_syscall	/* reserved for afs_syscall */
-	.long sys_setfsuid16
-	.long sys_setfsgid16
-	.long sys_llseek	/* 140 */
-	.long sys_getdents
-	.long sys_select
-	.long sys_flock
-	.long sys_msync
-	.long sys_readv		/* 145 */
-	.long sys_writev
-	.long sys_getsid
-	.long sys_fdatasync
-	.long sys_sysctl
-	.long sys_mlock		/* 150 */
-	.long sys_munlock
-	.long sys_mlockall
-	.long sys_munlockall
-	.long sys_sched_setparam
-	.long sys_sched_getparam   /* 155 */
-	.long sys_sched_setscheduler
-	.long sys_sched_getscheduler
-	.long sys_sched_yield
-	.long sys_sched_get_priority_max
-	.long sys_sched_get_priority_min  /* 160 */
-	.long sys_sched_rr_get_interval
-	.long sys_nanosleep
-	.long sys_mremap
-	.long sys_setresuid16
-	.long sys_getresuid16	/* 165 */
-	.long ptregs_vm86
-	.long sys_ni_syscall	/* Old sys_query_module */
-	.long sys_poll
-	.long sys_ni_syscall	/* Old nfsservctl */
-	.long sys_setresgid16	/* 170 */
-	.long sys_getresgid16
-	.long sys_prctl
-	.long ptregs_rt_sigreturn
-	.long sys_rt_sigaction
-	.long sys_rt_sigprocmask	/* 175 */
-	.long sys_rt_sigpending
-	.long sys_rt_sigtimedwait
-	.long sys_rt_sigqueueinfo
-	.long sys_rt_sigsuspend
-	.long sys_pread64	/* 180 */
-	.long sys_pwrite64
-	.long sys_chown16
-	.long sys_getcwd
-	.long sys_capget
-	.long sys_capset	/* 185 */
-	.long ptregs_sigaltstack
-	.long sys_sendfile
-	.long sys_ni_syscall	/* reserved for streams1 */
-	.long sys_ni_syscall	/* reserved for streams2 */
-	.long ptregs_vfork	/* 190 */
-	.long sys_getrlimit
-	.long sys_mmap_pgoff
-	.long sys_truncate64
-	.long sys_ftruncate64
-	.long sys_stat64	/* 195 */
-	.long sys_lstat64
-	.long sys_fstat64
-	.long sys_lchown
-	.long sys_getuid
-	.long sys_getgid	/* 200 */
-	.long sys_geteuid
-	.long sys_getegid
-	.long sys_setreuid
-	.long sys_setregid
-	.long sys_getgroups	/* 205 */
-	.long sys_setgroups
-	.long sys_fchown
-	.long sys_setresuid
-	.long sys_getresuid
-	.long sys_setresgid	/* 210 */
-	.long sys_getresgid
-	.long sys_chown
-	.long sys_setuid
-	.long sys_setgid
-	.long sys_setfsuid	/* 215 */
-	.long sys_setfsgid
-	.long sys_pivot_root
-	.long sys_mincore
-	.long sys_madvise
-	.long sys_getdents64	/* 220 */
-	.long sys_fcntl64
-	.long sys_ni_syscall	/* reserved for TUX */
-	.long sys_ni_syscall
-	.long sys_gettid
-	.long sys_readahead	/* 225 */
-	.long sys_setxattr
-	.long sys_lsetxattr
-	.long sys_fsetxattr
-	.long sys_getxattr
-	.long sys_lgetxattr	/* 230 */
-	.long sys_fgetxattr
-	.long sys_listxattr
-	.long sys_llistxattr
-	.long sys_flistxattr
-	.long sys_removexattr	/* 235 */
-	.long sys_lremovexattr
-	.long sys_fremovexattr
-	.long sys_tkill
-	.long sys_sendfile64
-	.long sys_futex		/* 240 */
-	.long sys_sched_setaffinity
-	.long sys_sched_getaffinity
-	.long sys_set_thread_area
-	.long sys_get_thread_area
-	.long sys_io_setup	/* 245 */
-	.long sys_io_destroy
-	.long sys_io_getevents
-	.long sys_io_submit
-	.long sys_io_cancel
-	.long sys_fadvise64	/* 250 */
-	.long sys_ni_syscall
-	.long sys_exit_group
-	.long sys_lookup_dcookie
-	.long sys_epoll_create
-	.long sys_epoll_ctl	/* 255 */
-	.long sys_epoll_wait
- 	.long sys_remap_file_pages
- 	.long sys_set_tid_address
- 	.long sys_timer_create
- 	.long sys_timer_settime		/* 260 */
- 	.long sys_timer_gettime
- 	.long sys_timer_getoverrun
- 	.long sys_timer_delete
- 	.long sys_clock_settime
- 	.long sys_clock_gettime		/* 265 */
- 	.long sys_clock_getres
- 	.long sys_clock_nanosleep
-	.long sys_statfs64
-	.long sys_fstatfs64
-	.long sys_tgkill	/* 270 */
-	.long sys_utimes
- 	.long sys_fadvise64_64
-	.long sys_ni_syscall	/* sys_vserver */
-	.long sys_mbind
-	.long sys_get_mempolicy
-	.long sys_set_mempolicy
-	.long sys_mq_open
-	.long sys_mq_unlink
-	.long sys_mq_timedsend
-	.long sys_mq_timedreceive	/* 280 */
-	.long sys_mq_notify
-	.long sys_mq_getsetattr
-	.long sys_kexec_load
-	.long sys_waitid
-	.long sys_ni_syscall		/* 285 */ /* available */
-	.long sys_add_key
-	.long sys_request_key
-	.long sys_keyctl
-	.long sys_ioprio_set
-	.long sys_ioprio_get		/* 290 */
-	.long sys_inotify_init
-	.long sys_inotify_add_watch
-	.long sys_inotify_rm_watch
-	.long sys_migrate_pages
-	.long sys_openat		/* 295 */
-	.long sys_mkdirat
-	.long sys_mknodat
-	.long sys_fchownat
-	.long sys_futimesat
-	.long sys_fstatat64		/* 300 */
-	.long sys_unlinkat
-	.long sys_renameat
-	.long sys_linkat
-	.long sys_symlinkat
-	.long sys_readlinkat		/* 305 */
-	.long sys_fchmodat
-	.long sys_faccessat
-	.long sys_pselect6
-	.long sys_ppoll
-	.long sys_unshare		/* 310 */
-	.long sys_set_robust_list
-	.long sys_get_robust_list
-	.long sys_splice
-	.long sys_sync_file_range
-	.long sys_tee			/* 315 */
-	.long sys_vmsplice
-	.long sys_move_pages
-	.long sys_getcpu
-	.long sys_epoll_pwait
-	.long sys_utimensat		/* 320 */
-	.long sys_signalfd
-	.long sys_timerfd_create
-	.long sys_eventfd
-	.long sys_fallocate
-	.long sys_timerfd_settime	/* 325 */
-	.long sys_timerfd_gettime
-	.long sys_signalfd4
-	.long sys_eventfd2
-	.long sys_epoll_create1
-	.long sys_dup3			/* 330 */
-	.long sys_pipe2
-	.long sys_inotify_init1
-	.long sys_preadv
-	.long sys_pwritev
-	.long sys_rt_tgsigqueueinfo	/* 335 */
-	.long sys_perf_event_open
-	.long sys_recvmmsg
-	.long sys_fanotify_init
-	.long sys_fanotify_mark
-	.long sys_prlimit64		/* 340 */
-	.long sys_name_to_handle_at
-	.long sys_open_by_handle_at
-	.long sys_clock_adjtime
-	.long sys_syncfs
-	.long sys_sendmmsg		/* 345 */
-	.long sys_setns
-	.long sys_process_vm_readv
-	.long sys_process_vm_writev
-- 
cgit v1.2.3-55-g7522


From f14525f9e033f344996905744f41680ea2b877ce Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Fri, 18 Nov 2011 16:03:27 -0800
Subject: x86: Simplify syscallhdr.sh

Simplify syscallhdr.sh by letting grep sort out the ABIs that we want,
rather than relying on manual list matching.  This is safe since the
ABI strings already have to consist only of characters which are valid in C
macro names.

Suggested-by: Matt Helsley <matthltc@us.ibm.com>
Link: http://lkml.kernel.org/r/20111118221558.GA6408@count0.beaverton.ibm.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/syscalls/syscallhdr.sh | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh
index 0d473ff12eaf..b3c593072785 100644
--- a/arch/x86/syscalls/syscallhdr.sh
+++ b/arch/x86/syscalls/syscallhdr.sh
@@ -2,33 +2,20 @@
 
 in="$1"
 out="$2"
-my_abis=`echo "$3" | tr ',' ' '`
+my_abis=`echo "($3)" | tr ',' '|'`
 prefix="$4"
 offset="$5"
 
 fileguard=_ASM_X86_`basename "$out" | sed \
     -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
     -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
-
-in_list () {
-    local x
-    for x in $1; do
-	if [ x"$x" = x"$2" ]; then
-	    return 0
-	fi
-    done
-    return 1
-}
-
-grep '^[0-9]' "$in" | sort -n | (
+grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
     echo "#ifndef ${fileguard}"
     echo "#define ${fileguard} 1"
     echo ""
 
     while read nr abi name entry ; do
-	if in_list "$my_abis" "$abi"; then
-	    echo "#define __NR_${prefix}${name}" $((nr+offset))
-        fi
+	echo "#define __NR_${prefix}${name}" $((nr+offset))
     done
 
     echo ""
-- 
cgit v1.2.3-55-g7522


From 61f1e7e20874e8f11dab69b6a4bf7616badd4fe8 Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Fri, 18 Nov 2011 16:25:07 -0800
Subject: x86, syscall: Re-fix typo in comment

Fix the same typo as was fixed in:

b7641d2c x86-64, syscall: Adjust comment spacing and remove typo

... for the new versions of this file (32-bit and IA32 compat).

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1321569446-20433-4-git-send-email-hpa@linux.intel.com
---
 arch/x86/ia32/syscall_ia32.c | 2 +-
 arch/x86/kernel/syscall_32.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
index d04d3dbc47d4..4754ba0f5d9f 100644
--- a/arch/x86/ia32/syscall_ia32.c
+++ b/arch/x86/ia32/syscall_ia32.c
@@ -17,7 +17,7 @@ extern void compat_ni_syscall(void);
 
 const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
 	/*
-	 * Smells like a like a compiler bug -- it doesn't work
+	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
 	 */
 	[0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index b37a57336609..147fcd4941c4 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -17,7 +17,7 @@ extern asmlinkage void sys_ni_syscall(void);
 
 const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 	/*
-	 * Smells like a like a compiler bug -- it doesn't work
+	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
 	 */
 	[0 ... __NR_syscall_max] = &sys_ni_syscall,
-- 
cgit v1.2.3-55-g7522


From 3f86886c72fb68088162c7e08cc7f85282f1860c Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Fri, 18 Nov 2011 17:01:19 -0800
Subject: x86, syscall: Allow syscall offset to be symbolic

Allow the specified syscall offset to be symbolic, e.g. a macro.  For
offset system calls, this if nothing else makes the generated code
easier to read.

Suggested-by: H. J. Lu <hjl.tools@gmail.com>
Link: http://lkml.kernel.org/r/1321569446-20433-7-git-send-email-hpa@linux.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/syscalls/syscallhdr.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh
index b3c593072785..31fd5f1f38f7 100644
--- a/arch/x86/syscalls/syscallhdr.sh
+++ b/arch/x86/syscalls/syscallhdr.sh
@@ -15,7 +15,11 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
     echo ""
 
     while read nr abi name entry ; do
-	echo "#define __NR_${prefix}${name}" $((nr+offset))
+	if [ -z "$offset" ]; then
+	    echo "#define __NR_${prefix}${name} $nr"
+	else
+	    echo "#define __NR_${prefix}${name} ($offset + $nr)"
+        fi
     done
 
     echo ""
-- 
cgit v1.2.3-55-g7522


From 937c30d7f560210b0163035edd42b2aef78fed9e Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Wed, 9 Nov 2011 16:26:25 +0200
Subject: crypto: serpent - add 8-way parallel x86_64/SSE2 assembler
 implementation

Patch adds x86_64/SSE2 assembler implementation of serpent cipher. Assembler
functions crypt data in eigth block chunks (two 4 block chunk SSE2 operations
in parallel to improve performance on out-of-order CPUs). Glue code is based
on one from AES-NI implementation, so requests from irq context are redirected
to cryptd.

v2:
 - add missing include of linux/module.h
   (appearently crypto.h used to include module.h, which changed for 3.2 by
    commit 7c926402a7e8c9b279968fd94efec8700ba3859e)

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios):

AMD Phenom II 1055T (fam:16, model:10):

size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     1.03x   1.01x   1.03x   1.05x   1.00x   0.99x
64B     1.00x   1.01x   1.02x   1.04x   1.02x   1.01x
256B    2.34x   2.41x   0.99x   2.43x   2.39x   2.40x
1024B   2.51x   2.57x   1.00x   2.59x   2.56x   2.56x
8192B   2.50x   2.54x   1.00x   2.55x   2.57x   2.57x

Intel Celeron T1600 (fam:6, model:15, step:13):

size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.97x   0.97x   1.01x   1.01x   1.01x   1.02x
64B     1.00x   1.00x   1.00x   1.02x   1.01x   1.01x
256B    3.41x   3.35x   1.00x   3.39x   3.42x   3.44x
1024B   3.75x   3.72x   0.99x   3.74x   3.75x   3.75x
8192B   3.70x   3.68x   0.99x   3.68x   3.69x   3.69x

Full output:
 http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-generic.txt
 http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-sse2.txt
 http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-generic.txt
 http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-sse2.txt

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile                     |   2 +
 arch/x86/crypto/serpent-sse2-x86_64-asm_64.S | 761 +++++++++++++++++++++++++++
 arch/x86/crypto/serpent_sse2_glue.c          | 719 +++++++++++++++++++++++++
 arch/x86/include/asm/serpent.h               |  32 ++
 crypto/Kconfig                               |  17 +
 crypto/testmgr.c                             |  60 +++
 6 files changed, 1591 insertions(+)
 create mode 100644 arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
 create mode 100644 arch/x86/crypto/serpent_sse2_glue.c
 create mode 100644 arch/x86/include/asm/serpent.h

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3537d4b91f74..12ebdbd80ccb 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
@@ -26,6 +27,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
+serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
new file mode 100644
index 000000000000..7f24a1540821
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -0,0 +1,761 @@
+/*
+ * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on crypto/serpent.c by
+ *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
+ *                2003 Herbert Valerio Riedel <hvr@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+.file "serpent-sse2-x86_64-asm_64.S"
+.text
+
+#define CTX %rdi
+
+/**********************************************************************
+  8-way SSE2 serpent
+ **********************************************************************/
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+#define RE1 %xmm4
+
+#define RA2 %xmm5
+#define RB2 %xmm6
+#define RC2 %xmm7
+#define RD2 %xmm8
+#define RE2 %xmm9
+
+#define RNOT %xmm10
+
+#define RK0 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
+#define RK3 %xmm14
+
+#define S0_1(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	por x0,			x3; \
+	pxor x4,		x0; \
+	pxor x2,		x4; \
+	pxor RNOT,		x4; \
+	pxor x1,		x3; \
+	pand x0,		x1; \
+	pxor x4,		x1; \
+	pxor x0,		x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+	pxor x3,		x0; \
+	por x0,			x4; \
+	pxor x2,		x0; \
+	pand x1,		x2; \
+	pxor x2,		x3; \
+	pxor RNOT,		x1; \
+	pxor x4,		x2; \
+	pxor x2,		x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	pxor x0,		x1; \
+	pxor x3,		x0; \
+	pxor RNOT,		x3; \
+	pand x1,		x4; \
+	por x1,			x0; \
+	pxor x2,		x3; \
+	pxor x3,		x0; \
+	pxor x3,		x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+	pxor x4,		x3; \
+	por x4,			x1; \
+	pxor x2,		x4; \
+	pand x0,		x2; \
+	pxor x1,		x2; \
+	por x0,			x1; \
+	pxor RNOT,		x0; \
+	pxor x2,		x0; \
+	pxor x1,		x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+	pxor RNOT,		x3; \
+	pxor x0,		x1; \
+	movdqa x0,		x4; \
+	pand x2,		x0; \
+	pxor x3,		x0; \
+	por x4,			x3; \
+	pxor x1,		x2; \
+	pxor x1,		x3; \
+	pand x0,		x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+	pxor x2,		x0; \
+	pand x3,		x2; \
+	por x1,			x3; \
+	pxor RNOT,		x0; \
+	pxor x0,		x3; \
+	pxor x0,		x4; \
+	pxor x2,		x0; \
+	por x2,			x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	pxor x3,		x1; \
+	por x0,			x3; \
+	pand x0,		x4; \
+	pxor x2,		x0; \
+	pxor x1,		x2; \
+	pand x3,		x1; \
+	pxor x3,		x2; \
+	por x4,			x0; \
+	pxor x3,		x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+	pxor x0,		x1; \
+	pand x3,		x0; \
+	pand x4,		x3; \
+	pxor x2,		x3; \
+	por x1,			x4; \
+	pand x1,		x2; \
+	pxor x3,		x4; \
+	pxor x3,		x0; \
+	pxor x2,		x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	pand x0,		x3; \
+	pxor x4,		x0; \
+	pxor x2,		x3; \
+	por x4,			x2; \
+	pxor x1,		x0; \
+	pxor x3,		x4; \
+	por x0,			x2; \
+	pxor x1,		x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+	pand x0,		x1; \
+	pxor x4,		x1; \
+	pand x2,		x4; \
+	pxor x3,		x2; \
+	pxor x0,		x4; \
+	por x1,			x3; \
+	pxor RNOT,		x1; \
+	pxor x0,		x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	por x0,			x1; \
+	pxor x1,		x2; \
+	pxor RNOT,		x3; \
+	pxor x0,		x4; \
+	pxor x2,		x0; \
+	pand x4,		x1; \
+	por x3,			x4; \
+	pxor x0,		x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+	pand x3,		x0; \
+	pxor x3,		x1; \
+	pxor x2,		x3; \
+	pxor x1,		x0; \
+	pand x4,		x2; \
+	pxor x2,		x1; \
+	pand x0,		x2; \
+	pxor x2,		x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	pxor x0,		x3; \
+	pxor x2,		x1; \
+	pxor x0,		x2; \
+	pand x3,		x0; \
+	por x3,			x1; \
+	pxor RNOT,		x4; \
+	pxor x1,		x0; \
+	pxor x2,		x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+	pxor x4,		x3; \
+	pxor x0,		x4; \
+	pand x0,		x2; \
+	pxor x1,		x4; \
+	pxor x3,		x2; \
+	pand x1,		x3; \
+	pxor x0,		x3; \
+	pxor x2,		x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+	pxor RNOT,		x1; \
+	movdqa x1,		x4; \
+	pxor RNOT,		x0; \
+	pand x2,		x1; \
+	pxor x3,		x1; \
+	por x4,			x3; \
+	pxor x2,		x4; \
+	pxor x3,		x2; \
+	pxor x0,		x3; \
+	por x1,			x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+	pand x0,		x2; \
+	pxor x4,		x0; \
+	pxor x3,		x4; \
+	pand x0,		x3; \
+	pxor x1,		x4; \
+	pxor x4,		x2; \
+	pxor x1,		x3; \
+	por x0,			x4; \
+	pxor x1,		x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	pxor x0,		x1; \
+	por x1,			x3; \
+	pxor x1,		x4; \
+	pxor RNOT,		x0; \
+	pxor x3,		x2; \
+	pxor x0,		x3; \
+	pand x1,		x0; \
+	pxor x2,		x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+	pand x3,		x2; \
+	pxor x4,		x3; \
+	pxor x3,		x2; \
+	pxor x3,		x1; \
+	pand x0,		x3; \
+	pxor x0,		x1; \
+	pxor x2,		x0; \
+	pxor x3,		x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+	pxor x3,		x1; \
+	movdqa x0,		x4; \
+	pxor x2,		x0; \
+	pxor RNOT,		x2; \
+	por x1,			x4; \
+	pxor x3,		x4; \
+	pand x1,		x3; \
+	pxor x2,		x1; \
+	pand x4,		x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+	pxor x1,		x4; \
+	por x3,			x1; \
+	pxor x0,		x3; \
+	pxor x0,		x2; \
+	por x4,			x0; \
+	pxor x4,		x2; \
+	pxor x0,		x1; \
+	pxor x1,		x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+	pxor x1,		x2; \
+	movdqa x3,		x4; \
+	pxor RNOT,		x3; \
+	por x2,			x3; \
+	pxor x4,		x2; \
+	pxor x0,		x4; \
+	pxor x1,		x3; \
+	por x2,			x1; \
+	pxor x0,		x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+	pxor x4,		x1; \
+	por x3,			x4; \
+	pxor x3,		x2; \
+	pxor x2,		x4; \
+	pand x1,		x2; \
+	pxor x3,		x2; \
+	pxor x4,		x3; \
+	pxor x0,		x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+	pxor x1,		x2; \
+	movdqa x1,		x4; \
+	pand x2,		x1; \
+	pxor x0,		x1; \
+	por x4,			x0; \
+	pxor x3,		x4; \
+	pxor x3,		x0; \
+	por x1,			x3; \
+	pxor x2,		x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+	pxor x3,		x1; \
+	pxor x2,		x0; \
+	pxor x3,		x2; \
+	pand x1,		x3; \
+	pxor x0,		x1; \
+	pand x2,		x0; \
+	pxor x3,		x4; \
+	pxor x0,		x3; \
+	pxor x1,		x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+	pxor x3,		x2; \
+	movdqa x0,		x4; \
+	pand x1,		x0; \
+	pxor x2,		x0; \
+	por x3,			x2; \
+	pxor RNOT,		x4; \
+	pxor x0,		x1; \
+	pxor x2,		x0; \
+	pand x4,		x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+	pxor x0,		x2; \
+	por x4,			x0; \
+	pxor x3,		x0; \
+	pand x2,		x3; \
+	pxor x3,		x4; \
+	pxor x1,		x3; \
+	pand x0,		x1; \
+	pxor x1,		x4; \
+	pxor x3,		x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	por x2,			x1; \
+	pxor x4,		x2; \
+	pxor x3,		x1; \
+	pand x4,		x3; \
+	pxor x3,		x2; \
+	por x0,			x3; \
+	pxor RNOT,		x0; \
+	pxor x2,		x3; \
+	por x0,			x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+	pxor x1,		x4; \
+	pxor x4,		x2; \
+	pand x0,		x4; \
+	pxor x1,		x0; \
+	pxor x3,		x1; \
+	pand x2,		x0; \
+	pxor x3,		x2; \
+	pxor x2,		x0; \
+	pxor x4,		x2; \
+	pxor x3,		x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+	pxor x2,		x0; \
+	movdqa x0,		x4; \
+	pand x3,		x0; \
+	pxor x3,		x2; \
+	pxor x2,		x0; \
+	pxor x1,		x3; \
+	por x4,			x2; \
+	pxor x3,		x2; \
+	pand x0,		x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+	pxor RNOT,		x0; \
+	pxor x1,		x3; \
+	pand x2,		x1; \
+	pxor x0,		x4; \
+	pxor x4,		x3; \
+	pxor x2,		x4; \
+	pxor x1,		x0; \
+	pxor x0,		x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	pand x0,		x3; \
+	pxor x2,		x0; \
+	por x4,			x2; \
+	pxor x1,		x4; \
+	pxor RNOT,		x0; \
+	por x3,			x1; \
+	pxor x0,		x4; \
+	pand x2,		x0; \
+	pxor x1,		x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+	pand x2,		x1; \
+	pxor x2,		x3; \
+	pxor x3,		x4; \
+	pand x3,		x2; \
+	por x0,			x3; \
+	pxor x4,		x1; \
+	pxor x4,		x3; \
+	pand x0,		x4; \
+	pxor x2,		x4;
+
+#define get_key(i, j, t) \
+	movd (4*(i)+(j))*4(CTX), t; \
+	pshufd $0, t, t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	get_key(i, 1, RK1); \
+	get_key(i, 2, RK2); \
+	get_key(i, 3, RK3); \
+	pxor RK0,		x0 ## 1; \
+	pxor RK1,		x1 ## 1; \
+	pxor RK2,		x2 ## 1; \
+	pxor RK3,		x3 ## 1; \
+		pxor RK0,		x0 ## 2; \
+		pxor RK1,		x1 ## 2; \
+		pxor RK2,		x2 ## 2; \
+		pxor RK3,		x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+	movdqa x0 ## 1,		x4 ## 1; \
+	pslld $13,		x0 ## 1; \
+	psrld $(32 - 13),	x4 ## 1; \
+	por x4 ## 1,		x0 ## 1; \
+	pxor x0 ## 1,		x1 ## 1; \
+	movdqa x2 ## 1,		x4 ## 1; \
+	pslld $3,		x2 ## 1; \
+	psrld $(32 - 3),	x4 ## 1; \
+	por x4 ## 1,		x2 ## 1; \
+	pxor x2 ## 1,		x1 ## 1; \
+		movdqa x0 ## 2,		x4 ## 2; \
+		pslld $13,		x0 ## 2; \
+		psrld $(32 - 13),	x4 ## 2; \
+		por x4 ## 2,		x0 ## 2; \
+		pxor x0 ## 2,		x1 ## 2; \
+		movdqa x2 ## 2,		x4 ## 2; \
+		pslld $3,		x2 ## 2; \
+		psrld $(32 - 3),	x4 ## 2; \
+		por x4 ## 2,		x2 ## 2; \
+		pxor x2 ## 2,		x1 ## 2; \
+	movdqa x1 ## 1,		x4 ## 1; \
+	pslld $1,		x1 ## 1; \
+	psrld $(32 - 1),	x4 ## 1; \
+	por x4 ## 1,		x1 ## 1; \
+	movdqa x0 ## 1,		x4 ## 1; \
+	pslld $3,		x4 ## 1; \
+	pxor x2 ## 1,		x3 ## 1; \
+	pxor x4 ## 1,		x3 ## 1; \
+	movdqa x3 ## 1,		x4 ## 1; \
+	get_key(i, 1, RK1); \
+		movdqa x1 ## 2,		x4 ## 2; \
+		pslld $1,		x1 ## 2; \
+		psrld $(32 - 1),	x4 ## 2; \
+		por x4 ## 2,		x1 ## 2; \
+		movdqa x0 ## 2,		x4 ## 2; \
+		pslld $3,		x4 ## 2; \
+		pxor x2 ## 2,		x3 ## 2; \
+		pxor x4 ## 2,		x3 ## 2; \
+		movdqa x3 ## 2,		x4 ## 2; \
+		get_key(i, 3, RK3); \
+	pslld $7,		x3 ## 1; \
+	psrld $(32 - 7),	x4 ## 1; \
+	por x4 ## 1,		x3 ## 1; \
+	movdqa x1 ## 1,		x4 ## 1; \
+	pslld $7,		x4 ## 1; \
+	pxor x1 ## 1,		x0 ## 1; \
+	pxor x3 ## 1,		x0 ## 1; \
+	pxor x3 ## 1,		x2 ## 1; \
+	pxor x4 ## 1,		x2 ## 1; \
+	get_key(i, 0, RK0); \
+		pslld $7,		x3 ## 2; \
+		psrld $(32 - 7),	x4 ## 2; \
+		por x4 ## 2,		x3 ## 2; \
+		movdqa x1 ## 2,		x4 ## 2; \
+		pslld $7,		x4 ## 2; \
+		pxor x1 ## 2,		x0 ## 2; \
+		pxor x3 ## 2,		x0 ## 2; \
+		pxor x3 ## 2,		x2 ## 2; \
+		pxor x4 ## 2,		x2 ## 2; \
+		get_key(i, 2, RK2); \
+	pxor RK1,		x1 ## 1; \
+	pxor RK3,		x3 ## 1; \
+	movdqa x0 ## 1,		x4 ## 1; \
+	pslld $5,		x0 ## 1; \
+	psrld $(32 - 5),	x4 ## 1; \
+	por x4 ## 1,		x0 ## 1; \
+	movdqa x2 ## 1,		x4 ## 1; \
+	pslld $22,		x2 ## 1; \
+	psrld $(32 - 22),	x4 ## 1; \
+	por x4 ## 1,		x2 ## 1; \
+	pxor RK0,		x0 ## 1; \
+	pxor RK2,		x2 ## 1; \
+		pxor RK1,		x1 ## 2; \
+		pxor RK3,		x3 ## 2; \
+		movdqa x0 ## 2,		x4 ## 2; \
+		pslld $5,		x0 ## 2; \
+		psrld $(32 - 5),	x4 ## 2; \
+		por x4 ## 2,		x0 ## 2; \
+		movdqa x2 ## 2,		x4 ## 2; \
+		pslld $22,		x2 ## 2; \
+		psrld $(32 - 22),	x4 ## 2; \
+		por x4 ## 2,		x2 ## 2; \
+		pxor RK0,		x0 ## 2; \
+		pxor RK2,		x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+	pxor RK0,		x0 ## 1; \
+	pxor RK2,		x2 ## 1; \
+	movdqa x0 ## 1,		x4 ## 1; \
+	psrld $5,		x0 ## 1; \
+	pslld $(32 - 5),	x4 ## 1; \
+	por x4 ## 1,		x0 ## 1; \
+	pxor RK3,		x3 ## 1; \
+	pxor RK1,		x1 ## 1; \
+	movdqa x2 ## 1,		x4 ## 1; \
+	psrld $22,		x2 ## 1; \
+	pslld $(32 - 22),	x4 ## 1; \
+	por x4 ## 1,		x2 ## 1; \
+	pxor x3 ## 1,		x2 ## 1; \
+		pxor RK0,		x0 ## 2; \
+		pxor RK2,		x2 ## 2; \
+		movdqa x0 ## 2,		x4 ## 2; \
+		psrld $5,		x0 ## 2; \
+		pslld $(32 - 5),	x4 ## 2; \
+		por x4 ## 2,		x0 ## 2; \
+		pxor RK3,		x3 ## 2; \
+		pxor RK1,		x1 ## 2; \
+		movdqa x2 ## 2,		x4 ## 2; \
+		psrld $22,		x2 ## 2; \
+		pslld $(32 - 22),	x4 ## 2; \
+		por x4 ## 2,		x2 ## 2; \
+		pxor x3 ## 2,		x2 ## 2; \
+	pxor x3 ## 1,		x0 ## 1; \
+	movdqa x1 ## 1,		x4 ## 1; \
+	pslld $7,		x4 ## 1; \
+	pxor x1 ## 1,		x0 ## 1; \
+	pxor x4 ## 1,		x2 ## 1; \
+	movdqa x1 ## 1,		x4 ## 1; \
+	psrld $1,		x1 ## 1; \
+	pslld $(32 - 1),	x4 ## 1; \
+	por x4 ## 1,		x1 ## 1; \
+		pxor x3 ## 2,		x0 ## 2; \
+		movdqa x1 ## 2,		x4 ## 2; \
+		pslld $7,		x4 ## 2; \
+		pxor x1 ## 2,		x0 ## 2; \
+		pxor x4 ## 2,		x2 ## 2; \
+		movdqa x1 ## 2,		x4 ## 2; \
+		psrld $1,		x1 ## 2; \
+		pslld $(32 - 1),	x4 ## 2; \
+		por x4 ## 2,		x1 ## 2; \
+	movdqa x3 ## 1,		x4 ## 1; \
+	psrld $7,		x3 ## 1; \
+	pslld $(32 - 7),	x4 ## 1; \
+	por x4 ## 1,		x3 ## 1; \
+	pxor x0 ## 1,		x1 ## 1; \
+	movdqa x0 ## 1,		x4 ## 1; \
+	pslld $3,		x4 ## 1; \
+	pxor x4 ## 1,		x3 ## 1; \
+	movdqa x0 ## 1,		x4 ## 1; \
+		movdqa x3 ## 2,		x4 ## 2; \
+		psrld $7,		x3 ## 2; \
+		pslld $(32 - 7),	x4 ## 2; \
+		por x4 ## 2,		x3 ## 2; \
+		pxor x0 ## 2,		x1 ## 2; \
+		movdqa x0 ## 2,		x4 ## 2; \
+		pslld $3,		x4 ## 2; \
+		pxor x4 ## 2,		x3 ## 2; \
+		movdqa x0 ## 2,		x4 ## 2; \
+	psrld $13,		x0 ## 1; \
+	pslld $(32 - 13),	x4 ## 1; \
+	por x4 ## 1,		x0 ## 1; \
+	pxor x2 ## 1,		x1 ## 1; \
+	pxor x2 ## 1,		x3 ## 1; \
+	movdqa x2 ## 1,		x4 ## 1; \
+	psrld $3,		x2 ## 1; \
+	pslld $(32 - 3),	x4 ## 1; \
+	por x4 ## 1,		x2 ## 1; \
+		psrld $13,		x0 ## 2; \
+		pslld $(32 - 13),	x4 ## 2; \
+		por x4 ## 2,		x0 ## 2; \
+		pxor x2 ## 2,		x1 ## 2; \
+		pxor x2 ## 2,		x3 ## 2; \
+		movdqa x2 ## 2,		x4 ## 2; \
+		psrld $3,		x2 ## 2; \
+		pslld $(32 - 3),	x4 ## 2; \
+		por x4 ## 2,		x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 2, RK2); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	get_key(i, 3, RK3); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 1, RK1); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+	movdqa x2,		t3; \
+	movdqa x0,		t1; \
+	unpcklps x3,		t3; \
+	movdqa x0,		t2; \
+	unpcklps x1,		t1; \
+	unpckhps x1,		t2; \
+	movdqa t3,		x1; \
+	unpckhps x3,		x2; \
+	movdqa t1,		x0; \
+	movhlps t1,		x1; \
+	movdqa t2,		t1; \
+	movlhps t3,		x0; \
+	movlhps x2,		t1; \
+	movhlps t2,		x2; \
+	movdqa x2,		x3; \
+	movdqa t1,		x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+	movdqu (0*4*4)(in),	x0; \
+	movdqu (1*4*4)(in),	x1; \
+	movdqu (2*4*4)(in),	x2; \
+	movdqu (3*4*4)(in),	x3; \
+	\
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	movdqu x0,		(0*4*4)(out); \
+	movdqu x1,		(1*4*4)(out); \
+	movdqu x2,		(2*4*4)(out); \
+	movdqu x3,		(3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	movdqu (0*4*4)(out),	t0; \
+	pxor t0,		x0; \
+	movdqu x0,		(0*4*4)(out); \
+	movdqu (1*4*4)(out),	t0; \
+	pxor t0,		x1; \
+	movdqu x1,		(1*4*4)(out); \
+	movdqu (2*4*4)(out),	t0; \
+	pxor t0,		x2; \
+	movdqu x2,		(2*4*4)(out); \
+	movdqu (3*4*4)(out),	t0; \
+	pxor t0,		x3; \
+	movdqu x3,		(3*4*4)(out);
+
+.align 8
+.global __serpent_enc_blk_8way
+.type   __serpent_enc_blk_8way,@function;
+
+__serpent_enc_blk_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: bool, if true: xor output
+	 */
+
+	pcmpeqd RNOT, RNOT;
+
+	leaq (4*4*4)(%rdx), %rax;
+	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+						 K2(RA, RB, RC, RD, RE, 0);
+	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
+	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
+	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
+	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
+	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
+	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
+	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
+	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
+	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
+	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
+	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
+	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
+	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
+	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
+	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
+	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
+	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
+	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
+	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
+	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
+	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
+	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
+	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
+	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
+	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
+	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
+	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
+	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
+	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
+	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
+	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
+	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);
+
+	leaq (4*4*4)(%rsi), %rax;
+
+	testb %cl, %cl;
+	jnz __enc_xor8;
+
+	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	ret;
+
+__enc_xor8:
+	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	ret;
+
+.align 8
+.global serpent_dec_blk_8way
+.type   serpent_dec_blk_8way,@function;
+
+serpent_dec_blk_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pcmpeqd RNOT, RNOT;
+
+	leaq (4*4*4)(%rdx), %rax;
+	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+						 K2(RA, RB, RC, RD, RE, 32);
+	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
+	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
+	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
+	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
+	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
+	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
+	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
+	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
+	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
+	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
+	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
+	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
+	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
+	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
+	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
+	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
+	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
+	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
+	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
+	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
+	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
+	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
+	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
+	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
+	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
+	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
+	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
+	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
+	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
+	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
+	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
+	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);
+
+	leaq (4*4*4)(%rsi), %rax;
+	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+	ret;
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
new file mode 100644
index 000000000000..947cf570f6a7
--- /dev/null
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -0,0 +1,719 @@
+/*
+ * Glue Code for SSE2 assembler versions of Serpent Cipher
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Glue code based on aesni-intel_glue.c by:
+ *  Copyright (C) 2008, Intel Corp.
+ *    Author: Huang Ying <ying.huang@intel.com>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/serpent.h>
+#include <crypto/cryptd.h>
+#include <crypto/b128ops.h>
+#include <crypto/ctr.h>
+#include <asm/i387.h>
+#include <asm/serpent.h>
+#include <crypto/scatterwalk.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+
+struct async_serpent_ctx {
+	struct cryptd_ablkcipher *cryptd_tfm;
+};
+
+static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	if (fpu_enabled)
+		return true;
+
+	/* SSE2 is only used when chunk to be processed is large enough, so
+	 * do not enable FPU until it is necessary.
+	 */
+	if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
+		return false;
+
+	kernel_fpu_begin();
+	return true;
+}
+
+static inline void serpent_fpu_end(bool fpu_enabled)
+{
+	if (fpu_enabled)
+		kernel_fpu_end();
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+		     bool enc)
+{
+	bool fpu_enabled = false;
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	unsigned int nbytes;
+	int err;
+
+	err = blkcipher_walk_virt(desc, walk);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk->nbytes)) {
+		u8 *wsrc = walk->src.virt.addr;
+		u8 *wdst = walk->dst.virt.addr;
+
+		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+
+		/* Process multi-block batch */
+		if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+			do {
+				if (enc)
+					serpent_enc_blk_xway(ctx, wdst, wsrc);
+				else
+					serpent_dec_blk_xway(ctx, wdst, wsrc);
+
+				wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
+				wdst += bsize * SERPENT_PARALLEL_BLOCKS;
+				nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+			} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+
+		/* Handle leftovers */
+		do {
+			if (enc)
+				__serpent_encrypt(ctx, wdst, wsrc);
+			else
+				__serpent_decrypt(ctx, wdst, wsrc);
+
+			wsrc += bsize;
+			wdst += bsize;
+			nbytes -= bsize;
+		} while (nbytes >= bsize);
+
+done:
+		err = blkcipher_walk_done(desc, walk, nbytes);
+	}
+
+	serpent_fpu_end(fpu_enabled);
+	return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, true);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, false);
+}
+
+static struct crypto_alg blk_ecb_alg = {
+	.cra_name		= "__ecb-serpent-sse2",
+	.cra_driver_name	= "__driver-ecb-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_ecb_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+};
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 *iv = (u128 *)walk->iv;
+
+	do {
+		u128_xor(dst, src, iv);
+		__serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
+		iv = dst;
+
+		src += 1;
+		dst += 1;
+		nbytes -= bsize;
+	} while (nbytes >= bsize);
+
+	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
+	return nbytes;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while ((nbytes = walk.nbytes)) {
+		nbytes = __cbc_encrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
+	u128 last_iv;
+	int i;
+
+	/* Start of the last block. */
+	src += nbytes / bsize - 1;
+	dst += nbytes / bsize - 1;
+
+	last_iv = *src;
+
+	/* Process multi-block batch */
+	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+		do {
+			nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
+			src -= SERPENT_PARALLEL_BLOCKS - 1;
+			dst -= SERPENT_PARALLEL_BLOCKS - 1;
+
+			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
+				ivs[i] = src[i];
+
+			serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
+
+			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
+				u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);
+
+			nbytes -= bsize;
+			if (nbytes < bsize)
+				goto done;
+
+			u128_xor(dst, dst, src - 1);
+			src -= 1;
+			dst -= 1;
+		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Handle leftovers */
+	for (;;) {
+		__serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);
+
+		nbytes -= bsize;
+		if (nbytes < bsize)
+			break;
+
+		u128_xor(dst, dst, src - 1);
+		src -= 1;
+		dst -= 1;
+	}
+
+done:
+	u128_xor(dst, dst, (u128 *)walk->iv);
+	*(u128 *)walk->iv = last_iv;
+
+	return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk.nbytes)) {
+		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+		nbytes = __cbc_decrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	serpent_fpu_end(fpu_enabled);
+	return err;
+}
+
+static struct crypto_alg blk_cbc_alg = {
+	.cra_name		= "__cbc-serpent-sse2",
+	.cra_driver_name	= "__driver-cbc-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_cbc_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+};
+
+static inline void u128_to_be128(be128 *dst, const u128 *src)
+{
+	dst->a = cpu_to_be64(src->a);
+	dst->b = cpu_to_be64(src->b);
+}
+
+static inline void be128_to_u128(u128 *dst, const be128 *src)
+{
+	dst->a = be64_to_cpu(src->a);
+	dst->b = be64_to_cpu(src->b);
+}
+
+static inline void u128_inc(u128 *i)
+{
+	i->b++;
+	if (!i->b)
+		i->a++;
+}
+
+static void ctr_crypt_final(struct blkcipher_desc *desc,
+			    struct blkcipher_walk *walk)
+{
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	u8 *ctrblk = walk->iv;
+	u8 keystream[SERPENT_BLOCK_SIZE];
+	u8 *src = walk->src.virt.addr;
+	u8 *dst = walk->dst.virt.addr;
+	unsigned int nbytes = walk->nbytes;
+
+	__serpent_encrypt(ctx, keystream, ctrblk);
+	crypto_xor(keystream, src, nbytes);
+	memcpy(dst, keystream, nbytes);
+
+	crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+				struct blkcipher_walk *walk)
+{
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 ctrblk;
+	be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
+	int i;
+
+	be128_to_u128(&ctrblk, (be128 *)walk->iv);
+
+	/* Process multi-block batch */
+	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+		do {
+			/* create ctrblks for parallel encrypt */
+			for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
+				if (dst != src)
+					dst[i] = src[i];
+
+				u128_to_be128(&ctrblocks[i], &ctrblk);
+				u128_inc(&ctrblk);
+			}
+
+			serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
+						 (u8 *)ctrblocks);
+
+			src += SERPENT_PARALLEL_BLOCKS;
+			dst += SERPENT_PARALLEL_BLOCKS;
+			nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Handle leftovers */
+	do {
+		if (dst != src)
+			*dst = *src;
+
+		u128_to_be128(&ctrblocks[0], &ctrblk);
+		u128_inc(&ctrblk);
+
+		__serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
+		u128_xor(dst, dst, (u128 *)ctrblocks);
+
+		src += 1;
+		dst += 1;
+		nbytes -= bsize;
+	} while (nbytes >= bsize);
+
+done:
+	u128_to_be128((be128 *)walk->iv, &ctrblk);
+	return nbytes;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
+		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+		nbytes = __ctr_crypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	serpent_fpu_end(fpu_enabled);
+
+	if (walk.nbytes) {
+		ctr_crypt_final(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+
+	return err;
+}
+
+static struct crypto_alg blk_ctr_alg = {
+	.cra_name		= "__ctr-serpent-sse2",
+	.cra_driver_name	= "__driver-ctr-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_ctr_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+};
+
+static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
+			unsigned int key_len)
+{
+	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
+	int err;
+
+	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
+				    & CRYPTO_TFM_REQ_MASK);
+	err = crypto_ablkcipher_setkey(child, key, key_len);
+	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
+				    & CRYPTO_TFM_RES_MASK);
+	return err;
+}
+
+static int __ablk_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct blkcipher_desc desc;
+
+	desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+	desc.info = req->info;
+	desc.flags = 0;
+
+	return crypto_blkcipher_crt(desc.tfm)->encrypt(
+		&desc, req->dst, req->src, req->nbytes);
+}
+
+static int ablk_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct ablkcipher_request *cryptd_req =
+			ablkcipher_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+		return crypto_ablkcipher_encrypt(cryptd_req);
+	} else {
+		return __ablk_encrypt(req);
+	}
+}
+
+static int ablk_decrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct ablkcipher_request *cryptd_req =
+			ablkcipher_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+		return crypto_ablkcipher_decrypt(cryptd_req);
+	} else {
+		struct blkcipher_desc desc;
+
+		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+		desc.info = req->info;
+		desc.flags = 0;
+
+		return crypto_blkcipher_crt(desc.tfm)->decrypt(
+			&desc, req->dst, req->src, req->nbytes);
+	}
+}
+
+static void ablk_exit(struct crypto_tfm *tfm)
+{
+	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cryptd_free_ablkcipher(ctx->cryptd_tfm);
+}
+
+static void ablk_init_common(struct crypto_tfm *tfm,
+			     struct cryptd_ablkcipher *cryptd_tfm)
+{
+	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->cryptd_tfm = cryptd_tfm;
+	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
+		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
+}
+
+static int ablk_ecb_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_ecb_alg = {
+	.cra_name		= "ecb(serpent)",
+	.cra_driver_name	= "ecb-serpent-sse2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
+	.cra_init		= ablk_ecb_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+
+static int ablk_cbc_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_cbc_alg = {
+	.cra_name		= "cbc(serpent)",
+	.cra_driver_name	= "cbc-serpent-sse2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
+	.cra_init		= ablk_cbc_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+
+static int ablk_ctr_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_ctr_alg = {
+	.cra_name		= "ctr(serpent)",
+	.cra_driver_name	= "ctr-serpent-sse2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
+	.cra_init		= ablk_ctr_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+};
+
+static int __init serpent_sse2_init(void)
+{
+	int err;
+
+	if (!cpu_has_xmm2) {
+		printk(KERN_INFO "SSE2 instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	err = crypto_register_alg(&blk_ecb_alg);
+	if (err)
+		goto blk_ecb_err;
+	err = crypto_register_alg(&blk_cbc_alg);
+	if (err)
+		goto blk_cbc_err;
+	err = crypto_register_alg(&blk_ctr_alg);
+	if (err)
+		goto blk_ctr_err;
+	err = crypto_register_alg(&ablk_ecb_alg);
+	if (err)
+		goto ablk_ecb_err;
+	err = crypto_register_alg(&ablk_cbc_alg);
+	if (err)
+		goto ablk_cbc_err;
+	err = crypto_register_alg(&ablk_ctr_alg);
+	if (err)
+		goto ablk_ctr_err;
+	return err;
+
+ablk_ctr_err:
+	crypto_unregister_alg(&ablk_cbc_alg);
+ablk_cbc_err:
+	crypto_unregister_alg(&ablk_ecb_alg);
+ablk_ecb_err:
+	crypto_unregister_alg(&blk_ctr_alg);
+blk_ctr_err:
+	crypto_unregister_alg(&blk_cbc_alg);
+blk_cbc_err:
+	crypto_unregister_alg(&blk_ecb_alg);
+blk_ecb_err:
+	return err;
+}
+
+static void __exit serpent_sse2_exit(void)
+{
+	crypto_unregister_alg(&ablk_ctr_alg);
+	crypto_unregister_alg(&ablk_cbc_alg);
+	crypto_unregister_alg(&ablk_ecb_alg);
+	crypto_unregister_alg(&blk_ctr_alg);
+	crypto_unregister_alg(&blk_cbc_alg);
+	crypto_unregister_alg(&blk_ecb_alg);
+}
+
+module_init(serpent_sse2_init);
+module_exit(serpent_sse2_exit);
+
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("serpent");
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h
new file mode 100644
index 000000000000..b7fd3b595b27
--- /dev/null
+++ b/arch/x86/include/asm/serpent.h
@@ -0,0 +1,32 @@
+#ifndef ASM_X86_SERPENT_H
+#define ASM_X86_SERPENT_H
+
+#include <linux/crypto.h>
+#include <crypto/serpent.h>
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+				       const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+				     const u8 *src);
+
+static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+				   const u8 *src)
+{
+	__serpent_enc_blk_8way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
+				       const u8 *src)
+{
+	__serpent_enc_blk_8way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+				   const u8 *src)
+{
+	serpent_dec_blk_8way(ctx, dst, src);
+}
+
+#endif
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 733208fe0a2d..2df61e458f06 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -766,6 +766,23 @@ config CRYPTO_SERPENT
 	  See also:
 	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
 
+config CRYPTO_SERPENT_SSE2_X86_64
+	tristate "Serpent cipher algorithm (x86_64/SSE2)"
+	depends on X86 && 64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_SERPENT
+	help
+	  Serpent cipher algorithm, by Anderson, Biham & Knudsen.
+
+	  Keys are allowed to be from 0 to 256 bits in length, in steps
+	  of 8 bits.
+
+	  This module provides Serpent cipher algorithm that processes eigth
+	  blocks parallel using SSE2 instruction set.
+
+	  See also:
+	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
+
 config CRYPTO_TEA
 	tristate "TEA, XTEA and XETA cipher algorithms"
 	select CRYPTO_ALGAPI
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 01553a6754b7..bb54b882d738 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1534,6 +1534,21 @@ static int alg_test_null(const struct alg_test_desc *desc,
 /* Please keep this list sorted by algorithm name. */
 static const struct alg_test_desc alg_test_descs[] = {
 	{
+		.alg = "__cbc-serpent-sse2",
+		.test = alg_test_null,
+		.suite = {
+			.cipher = {
+				.enc = {
+					.vecs = NULL,
+					.count = 0
+				},
+				.dec = {
+					.vecs = NULL,
+					.count = 0
+				}
+			}
+		}
+	}, {
 		.alg = "__driver-cbc-aes-aesni",
 		.test = alg_test_null,
 		.suite = {
@@ -1548,6 +1563,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "__driver-cbc-serpent-sse2",
+		.test = alg_test_null,
+		.suite = {
+			.cipher = {
+				.enc = {
+					.vecs = NULL,
+					.count = 0
+				},
+				.dec = {
+					.vecs = NULL,
+					.count = 0
+				}
+			}
+		}
 	}, {
 		.alg = "__driver-ecb-aes-aesni",
 		.test = alg_test_null,
@@ -1563,6 +1593,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "__driver-ecb-serpent-sse2",
+		.test = alg_test_null,
+		.suite = {
+			.cipher = {
+				.enc = {
+					.vecs = NULL,
+					.count = 0
+				},
+				.dec = {
+					.vecs = NULL,
+					.count = 0
+				}
+			}
+		}
 	}, {
 		.alg = "__ghash-pclmulqdqni",
 		.test = alg_test_null,
@@ -1745,6 +1790,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "cryptd(__driver-ecb-serpent-sse2)",
+		.test = alg_test_null,
+		.suite = {
+			.cipher = {
+				.enc = {
+					.vecs = NULL,
+					.count = 0
+				},
+				.dec = {
+					.vecs = NULL,
+					.count = 0
+				}
+			}
+		}
 	}, {
 		.alg = "cryptd(__ghash-pclmulqdqni)",
 		.test = alg_test_null,
-- 
cgit v1.2.3-55-g7522


From 251496dbfc1be38bc43b49651f3d33c02faccc47 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Wed, 9 Nov 2011 16:26:31 +0200
Subject: crypto: serpent - add 4-way parallel i586/SSE2 assembler
 implementation

Patch adds i586/SSE2 assembler implementation of serpent cipher. Assembler
functions crypt data in four block chunks.

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios):

Intel Atom N270:

size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16      0.95x   1.12x   1.02x   1.07x   0.97x   0.98x
64      1.73x   1.82x   1.08x   1.82x   1.72x   1.73x
256     2.08x   2.00x   1.04x   2.07x   1.99x   2.01x
1024    2.28x   2.18x   1.05x   2.23x   2.17x   2.20x
8192    2.28x   2.13x   1.05x   2.23x   2.18x   2.20x

Full output:
 http://koti.mbnet.fi/axh/kernel/crypto/atom-n270/serpent-generic.txt
 http://koti.mbnet.fi/axh/kernel/crypto/atom-n270/serpent-sse2.txt

Userspace test results:

Encryption/decryption of sse2-i586 vs generic on Intel Atom N270:
 encrypt: 2.35x
 decrypt: 2.54x

Encryption/decryption of sse2-i586 vs generic on AMD Phenom II:
 encrypt: 1.82x
 decrypt: 2.51x

Encryption/decryption of sse2-i586 vs generic on Intel Xeon E7330:
 encrypt: 2.99x
 decrypt: 3.48x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile                   |   2 +
 arch/x86/crypto/serpent-sse2-i586-asm_32.S | 638 +++++++++++++++++++++++++++++
 arch/x86/include/asm/serpent.h             |  31 ++
 crypto/Kconfig                             |  17 +
 4 files changed, 688 insertions(+)
 create mode 100644 arch/x86/crypto/serpent-sse2-i586-asm_32.S

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 12ebdbd80ccb..2b0b9631474b 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,6 +5,7 @@
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
@@ -21,6 +22,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
 salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
+serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
new file mode 100644
index 000000000000..4e37677ca851
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -0,0 +1,638 @@
+/*
+ * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on crypto/serpent.c by
+ *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
+ *                2003 Herbert Valerio Riedel <hvr@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+.file "serpent-sse2-i586-asm_32.S"
+.text
+
+#define arg_ctx 4
+#define arg_dst 8
+#define arg_src 12
+#define arg_xor 16
+
+/**********************************************************************
+  4-way SSE2 serpent
+ **********************************************************************/
+#define CTX %edx
+
+#define RA %xmm0
+#define RB %xmm1
+#define RC %xmm2
+#define RD %xmm3
+#define RE %xmm4
+
+#define RT0 %xmm5
+#define RT1 %xmm6
+
+#define RNOT %xmm7
+
+#define get_key(i, j, t) \
+	movd (4*(i)+(j))*4(CTX), t; \
+	pshufd $0, t, t;
+
+#define K(x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, x4); \
+	get_key(i, 1, RT0); \
+	get_key(i, 2, RT1); \
+	pxor x4,		x0; \
+	pxor RT0,		x1; \
+	pxor RT1,		x2; \
+	get_key(i, 3, x4); \
+	pxor x4,		x3;
+
+#define LK(x0, x1, x2, x3, x4, i) \
+	movdqa x0,		x4; \
+	pslld $13,		x0; \
+	psrld $(32 - 13),	x4; \
+	por x4,			x0; \
+	pxor x0,		x1; \
+	movdqa x2,		x4; \
+	pslld $3,		x2; \
+	psrld $(32 - 3),	x4; \
+	por x4,			x2; \
+	pxor x2,		x1; \
+	movdqa x1,		x4; \
+	pslld $1,		x1; \
+	psrld $(32 - 1),	x4; \
+	por x4,			x1; \
+	movdqa x0,		x4; \
+	pslld $3,		x4; \
+	pxor x2,		x3; \
+	pxor x4,		x3; \
+	movdqa x3,		x4; \
+	pslld $7,		x3; \
+	psrld $(32 - 7),	x4; \
+	por x4,			x3; \
+	movdqa x1,		x4; \
+	pslld $7,		x4; \
+	pxor x1,		x0; \
+	pxor x3,		x0; \
+	pxor x3,		x2; \
+	pxor x4,		x2; \
+	movdqa x0,		x4; \
+	get_key(i, 1, RT0); \
+	pxor RT0,		x1; \
+	get_key(i, 3, RT0); \
+	pxor RT0,		x3; \
+	pslld $5,		x0; \
+	psrld $(32 - 5),	x4; \
+	por x4,			x0; \
+	movdqa x2,		x4; \
+	pslld $22,		x2; \
+	psrld $(32 - 22),	x4; \
+	por x4,			x2; \
+	get_key(i, 0, RT0); \
+	pxor RT0,		x0; \
+	get_key(i, 2, RT0); \
+	pxor RT0,		x2;
+
+#define KL(x0, x1, x2, x3, x4, i) \
+	K(x0, x1, x2, x3, x4, i); \
+	movdqa x0,		x4; \
+	psrld $5,		x0; \
+	pslld $(32 - 5),	x4; \
+	por x4,			x0; \
+	movdqa x2,		x4; \
+	psrld $22,		x2; \
+	pslld $(32 - 22),	x4; \
+	por x4,			x2; \
+	pxor x3,		x2; \
+	pxor x3,		x0; \
+	movdqa x1,		x4; \
+	pslld $7,		x4; \
+	pxor x1,		x0; \
+	pxor x4,		x2; \
+	movdqa x1,		x4; \
+	psrld $1,		x1; \
+	pslld $(32 - 1),	x4; \
+	por x4,			x1; \
+	movdqa x3,		x4; \
+	psrld $7,		x3; \
+	pslld $(32 - 7),	x4; \
+	por x4,			x3; \
+	pxor x0,		x1; \
+	movdqa x0,		x4; \
+	pslld $3,		x4; \
+	pxor x4,		x3; \
+	movdqa x0,		x4; \
+	psrld $13,		x0; \
+	pslld $(32 - 13),	x4; \
+	por x4,			x0; \
+	pxor x2,		x1; \
+	pxor x2,		x3; \
+	movdqa x2,		x4; \
+	psrld $3,		x2; \
+	pslld $(32 - 3),	x4; \
+	por x4,			x2;
+
+#define S0(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	por x0,			x3; \
+	pxor x4,		x0; \
+	pxor x2,		x4; \
+	pxor RNOT,		x4; \
+	pxor x1,		x3; \
+	pand x0,		x1; \
+	pxor x4,		x1; \
+	pxor x0,		x2; \
+	pxor x3,		x0; \
+	por x0,			x4; \
+	pxor x2,		x0; \
+	pand x1,		x2; \
+	pxor x2,		x3; \
+	pxor RNOT,		x1; \
+	pxor x4,		x2; \
+	pxor x2,		x1;
+
+#define S1(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	pxor x0,		x1; \
+	pxor x3,		x0; \
+	pxor RNOT,		x3; \
+	pand x1,		x4; \
+	por x1,			x0; \
+	pxor x2,		x3; \
+	pxor x3,		x0; \
+	pxor x3,		x1; \
+	pxor x4,		x3; \
+	por x4,			x1; \
+	pxor x2,		x4; \
+	pand x0,		x2; \
+	pxor x1,		x2; \
+	por x0,			x1; \
+	pxor RNOT,		x0; \
+	pxor x2,		x0; \
+	pxor x1,		x4;
+
+#define S2(x0, x1, x2, x3, x4) \
+	pxor RNOT,		x3; \
+	pxor x0,		x1; \
+	movdqa x0,		x4; \
+	pand x2,		x0; \
+	pxor x3,		x0; \
+	por x4,			x3; \
+	pxor x1,		x2; \
+	pxor x1,		x3; \
+	pand x0,		x1; \
+	pxor x2,		x0; \
+	pand x3,		x2; \
+	por x1,			x3; \
+	pxor RNOT,		x0; \
+	pxor x0,		x3; \
+	pxor x0,		x4; \
+	pxor x2,		x0; \
+	por x2,			x1;
+
+#define S3(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	pxor x3,		x1; \
+	por x0,			x3; \
+	pand x0,		x4; \
+	pxor x2,		x0; \
+	pxor x1,		x2; \
+	pand x3,		x1; \
+	pxor x3,		x2; \
+	por x4,			x0; \
+	pxor x3,		x4; \
+	pxor x0,		x1; \
+	pand x3,		x0; \
+	pand x4,		x3; \
+	pxor x2,		x3; \
+	por x1,			x4; \
+	pand x1,		x2; \
+	pxor x3,		x4; \
+	pxor x3,		x0; \
+	pxor x2,		x3;
+
+#define S4(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	pand x0,		x3; \
+	pxor x4,		x0; \
+	pxor x2,		x3; \
+	por x4,			x2; \
+	pxor x1,		x0; \
+	pxor x3,		x4; \
+	por x0,			x2; \
+	pxor x1,		x2; \
+	pand x0,		x1; \
+	pxor x4,		x1; \
+	pand x2,		x4; \
+	pxor x3,		x2; \
+	pxor x0,		x4; \
+	por x1,			x3; \
+	pxor RNOT,		x1; \
+	pxor x0,		x3;
+
+#define S5(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	por x0,			x1; \
+	pxor x1,		x2; \
+	pxor RNOT,		x3; \
+	pxor x0,		x4; \
+	pxor x2,		x0; \
+	pand x4,		x1; \
+	por x3,			x4; \
+	pxor x0,		x4; \
+	pand x3,		x0; \
+	pxor x3,		x1; \
+	pxor x2,		x3; \
+	pxor x1,		x0; \
+	pand x4,		x2; \
+	pxor x2,		x1; \
+	pand x0,		x2; \
+	pxor x2,		x3;
+
+#define S6(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	pxor x0,		x3; \
+	pxor x2,		x1; \
+	pxor x0,		x2; \
+	pand x3,		x0; \
+	por x3,			x1; \
+	pxor RNOT,		x4; \
+	pxor x1,		x0; \
+	pxor x2,		x1; \
+	pxor x4,		x3; \
+	pxor x0,		x4; \
+	pand x0,		x2; \
+	pxor x1,		x4; \
+	pxor x3,		x2; \
+	pand x1,		x3; \
+	pxor x0,		x3; \
+	pxor x2,		x1;
+
+#define S7(x0, x1, x2, x3, x4) \
+	pxor RNOT,		x1; \
+	movdqa x1,		x4; \
+	pxor RNOT,		x0; \
+	pand x2,		x1; \
+	pxor x3,		x1; \
+	por x4,			x3; \
+	pxor x2,		x4; \
+	pxor x3,		x2; \
+	pxor x0,		x3; \
+	por x1,			x0; \
+	pand x0,		x2; \
+	pxor x4,		x0; \
+	pxor x3,		x4; \
+	pand x0,		x3; \
+	pxor x1,		x4; \
+	pxor x4,		x2; \
+	pxor x1,		x3; \
+	por x0,			x4; \
+	pxor x1,		x4;
+
+#define SI0(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	pxor x0,		x1; \
+	por x1,			x3; \
+	pxor x1,		x4; \
+	pxor RNOT,		x0; \
+	pxor x3,		x2; \
+	pxor x0,		x3; \
+	pand x1,		x0; \
+	pxor x2,		x0; \
+	pand x3,		x2; \
+	pxor x4,		x3; \
+	pxor x3,		x2; \
+	pxor x3,		x1; \
+	pand x0,		x3; \
+	pxor x0,		x1; \
+	pxor x2,		x0; \
+	pxor x3,		x4;
+
+#define SI1(x0, x1, x2, x3, x4) \
+	pxor x3,		x1; \
+	movdqa x0,		x4; \
+	pxor x2,		x0; \
+	pxor RNOT,		x2; \
+	por x1,			x4; \
+	pxor x3,		x4; \
+	pand x1,		x3; \
+	pxor x2,		x1; \
+	pand x4,		x2; \
+	pxor x1,		x4; \
+	por x3,			x1; \
+	pxor x0,		x3; \
+	pxor x0,		x2; \
+	por x4,			x0; \
+	pxor x4,		x2; \
+	pxor x0,		x1; \
+	pxor x1,		x4;
+
+#define SI2(x0, x1, x2, x3, x4) \
+	pxor x1,		x2; \
+	movdqa x3,		x4; \
+	pxor RNOT,		x3; \
+	por x2,			x3; \
+	pxor x4,		x2; \
+	pxor x0,		x4; \
+	pxor x1,		x3; \
+	por x2,			x1; \
+	pxor x0,		x2; \
+	pxor x4,		x1; \
+	por x3,			x4; \
+	pxor x3,		x2; \
+	pxor x2,		x4; \
+	pand x1,		x2; \
+	pxor x3,		x2; \
+	pxor x4,		x3; \
+	pxor x0,		x4;
+
+#define SI3(x0, x1, x2, x3, x4) \
+	pxor x1,		x2; \
+	movdqa x1,		x4; \
+	pand x2,		x1; \
+	pxor x0,		x1; \
+	por x4,			x0; \
+	pxor x3,		x4; \
+	pxor x3,		x0; \
+	por x1,			x3; \
+	pxor x2,		x1; \
+	pxor x3,		x1; \
+	pxor x2,		x0; \
+	pxor x3,		x2; \
+	pand x1,		x3; \
+	pxor x0,		x1; \
+	pand x2,		x0; \
+	pxor x3,		x4; \
+	pxor x0,		x3; \
+	pxor x1,		x0;
+
+#define SI4(x0, x1, x2, x3, x4) \
+	pxor x3,		x2; \
+	movdqa x0,		x4; \
+	pand x1,		x0; \
+	pxor x2,		x0; \
+	por x3,			x2; \
+	pxor RNOT,		x4; \
+	pxor x0,		x1; \
+	pxor x2,		x0; \
+	pand x4,		x2; \
+	pxor x0,		x2; \
+	por x4,			x0; \
+	pxor x3,		x0; \
+	pand x2,		x3; \
+	pxor x3,		x4; \
+	pxor x1,		x3; \
+	pand x0,		x1; \
+	pxor x1,		x4; \
+	pxor x3,		x0;
+
+#define SI5(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	por x2,			x1; \
+	pxor x4,		x2; \
+	pxor x3,		x1; \
+	pand x4,		x3; \
+	pxor x3,		x2; \
+	por x0,			x3; \
+	pxor RNOT,		x0; \
+	pxor x2,		x3; \
+	por x0,			x2; \
+	pxor x1,		x4; \
+	pxor x4,		x2; \
+	pand x0,		x4; \
+	pxor x1,		x0; \
+	pxor x3,		x1; \
+	pand x2,		x0; \
+	pxor x3,		x2; \
+	pxor x2,		x0; \
+	pxor x4,		x2; \
+	pxor x3,		x4;
+
+#define SI6(x0, x1, x2, x3, x4) \
+	pxor x2,		x0; \
+	movdqa x0,		x4; \
+	pand x3,		x0; \
+	pxor x3,		x2; \
+	pxor x2,		x0; \
+	pxor x1,		x3; \
+	por x4,			x2; \
+	pxor x3,		x2; \
+	pand x0,		x3; \
+	pxor RNOT,		x0; \
+	pxor x1,		x3; \
+	pand x2,		x1; \
+	pxor x0,		x4; \
+	pxor x4,		x3; \
+	pxor x2,		x4; \
+	pxor x1,		x0; \
+	pxor x0,		x2;
+
+#define SI7(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	pand x0,		x3; \
+	pxor x2,		x0; \
+	por x4,			x2; \
+	pxor x1,		x4; \
+	pxor RNOT,		x0; \
+	por x3,			x1; \
+	pxor x0,		x4; \
+	pand x2,		x0; \
+	pxor x1,		x0; \
+	pand x2,		x1; \
+	pxor x2,		x3; \
+	pxor x3,		x4; \
+	pand x3,		x2; \
+	por x0,			x3; \
+	pxor x4,		x1; \
+	pxor x4,		x3; \
+	pand x0,		x4; \
+	pxor x2,		x4;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+	movdqa x2,		t3; \
+	movdqa x0,		t1; \
+	unpcklps x3,		t3; \
+	movdqa x0,		t2; \
+	unpcklps x1,		t1; \
+	unpckhps x1,		t2; \
+	movdqa t3,		x1; \
+	unpckhps x3,		x2; \
+	movdqa t1,		x0; \
+	movhlps t1,		x1; \
+	movdqa t2,		t1; \
+	movlhps t3,		x0; \
+	movlhps x2,		t1; \
+	movhlps t2,		x2; \
+	movdqa x2,		x3; \
+	movdqa t1,		x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+	movdqu (0*4*4)(in),	x0; \
+	movdqu (1*4*4)(in),	x1; \
+	movdqu (2*4*4)(in),	x2; \
+	movdqu (3*4*4)(in),	x3; \
+	\
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	movdqu x0, (0*4*4)(out); \
+	movdqu x1, (1*4*4)(out); \
+	movdqu x2, (2*4*4)(out); \
+	movdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	movdqu (0*4*4)(out),	t0; \
+	pxor t0,		x0; \
+	movdqu x0,		(0*4*4)(out); \
+	movdqu (1*4*4)(out),	t0; \
+	pxor t0,		x1; \
+	movdqu x1,		(1*4*4)(out); \
+	movdqu (2*4*4)(out),	t0; \
+	pxor t0,		x2; \
+	movdqu x2,		(2*4*4)(out); \
+	movdqu (3*4*4)(out),	t0; \
+	pxor t0,		x3; \
+	movdqu x3,		(3*4*4)(out);
+
+.align 8
+.global __serpent_enc_blk_4way
+.type   __serpent_enc_blk_4way,@function;
+
+__serpent_enc_blk_4way:
+	/* input:
+	 *	arg_ctx(%esp): ctx, CTX
+	 *	arg_dst(%esp): dst
+	 *	arg_src(%esp): src
+	 *	arg_xor(%esp): bool, if true: xor output
+	 */
+
+	pcmpeqd RNOT, RNOT;
+
+	movl arg_ctx(%esp), CTX;
+
+	movl arg_src(%esp), %eax;
+	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+					 K(RA, RB, RC, RD, RE, 0);
+	S0(RA, RB, RC, RD, RE);		LK(RC, RB, RD, RA, RE, 1);
+	S1(RC, RB, RD, RA, RE);		LK(RE, RD, RA, RC, RB, 2);
+	S2(RE, RD, RA, RC, RB);		LK(RB, RD, RE, RC, RA, 3);
+	S3(RB, RD, RE, RC, RA);		LK(RC, RA, RD, RB, RE, 4);
+	S4(RC, RA, RD, RB, RE);		LK(RA, RD, RB, RE, RC, 5);
+	S5(RA, RD, RB, RE, RC);		LK(RC, RA, RD, RE, RB, 6);
+	S6(RC, RA, RD, RE, RB);		LK(RD, RB, RA, RE, RC, 7);
+	S7(RD, RB, RA, RE, RC);		LK(RC, RA, RE, RD, RB, 8);
+	S0(RC, RA, RE, RD, RB);		LK(RE, RA, RD, RC, RB, 9);
+	S1(RE, RA, RD, RC, RB);		LK(RB, RD, RC, RE, RA, 10);
+	S2(RB, RD, RC, RE, RA);		LK(RA, RD, RB, RE, RC, 11);
+	S3(RA, RD, RB, RE, RC);		LK(RE, RC, RD, RA, RB, 12);
+	S4(RE, RC, RD, RA, RB);		LK(RC, RD, RA, RB, RE, 13);
+	S5(RC, RD, RA, RB, RE);		LK(RE, RC, RD, RB, RA, 14);
+	S6(RE, RC, RD, RB, RA);		LK(RD, RA, RC, RB, RE, 15);
+	S7(RD, RA, RC, RB, RE);		LK(RE, RC, RB, RD, RA, 16);
+	S0(RE, RC, RB, RD, RA);		LK(RB, RC, RD, RE, RA, 17);
+	S1(RB, RC, RD, RE, RA);		LK(RA, RD, RE, RB, RC, 18);
+	S2(RA, RD, RE, RB, RC);		LK(RC, RD, RA, RB, RE, 19);
+	S3(RC, RD, RA, RB, RE);		LK(RB, RE, RD, RC, RA, 20);
+	S4(RB, RE, RD, RC, RA);		LK(RE, RD, RC, RA, RB, 21);
+	S5(RE, RD, RC, RA, RB);		LK(RB, RE, RD, RA, RC, 22);
+	S6(RB, RE, RD, RA, RC);		LK(RD, RC, RE, RA, RB, 23);
+	S7(RD, RC, RE, RA, RB);		LK(RB, RE, RA, RD, RC, 24);
+	S0(RB, RE, RA, RD, RC);		LK(RA, RE, RD, RB, RC, 25);
+	S1(RA, RE, RD, RB, RC);		LK(RC, RD, RB, RA, RE, 26);
+	S2(RC, RD, RB, RA, RE);		LK(RE, RD, RC, RA, RB, 27);
+	S3(RE, RD, RC, RA, RB);		LK(RA, RB, RD, RE, RC, 28);
+	S4(RA, RB, RD, RE, RC);		LK(RB, RD, RE, RC, RA, 29);
+	S5(RB, RD, RE, RC, RA);		LK(RA, RB, RD, RC, RE, 30);
+	S6(RA, RB, RD, RC, RE);		LK(RD, RE, RB, RC, RA, 31);
+	S7(RD, RE, RB, RC, RA);		 K(RA, RB, RC, RD, RE, 32);
+
+	movl arg_dst(%esp), %eax;
+
+	cmpb $0, arg_xor(%esp);
+	jnz __enc_xor4;
+
+	write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+	ret;
+
+__enc_xor4:
+	xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+	ret;
+
+.align 8
+.global serpent_dec_blk_4way
+.type   serpent_dec_blk_4way,@function;
+
+serpent_dec_blk_4way:
+	/* input:
+	 *	arg_ctx(%esp): ctx, CTX
+	 *	arg_dst(%esp): dst
+	 *	arg_src(%esp): src
+	 */
+
+	pcmpeqd RNOT, RNOT;
+
+	movl arg_ctx(%esp), CTX;
+
+	movl arg_src(%esp), %eax;
+	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+					 K(RA, RB, RC, RD, RE, 32);
+	SI7(RA, RB, RC, RD, RE);	KL(RB, RD, RA, RE, RC, 31);
+	SI6(RB, RD, RA, RE, RC);	KL(RA, RC, RE, RB, RD, 30);
+	SI5(RA, RC, RE, RB, RD);	KL(RC, RD, RA, RE, RB, 29);
+	SI4(RC, RD, RA, RE, RB);	KL(RC, RA, RB, RE, RD, 28);
+	SI3(RC, RA, RB, RE, RD);	KL(RB, RC, RD, RE, RA, 27);
+	SI2(RB, RC, RD, RE, RA);	KL(RC, RA, RE, RD, RB, 26);
+	SI1(RC, RA, RE, RD, RB);	KL(RB, RA, RE, RD, RC, 25);
+	SI0(RB, RA, RE, RD, RC);	KL(RE, RC, RA, RB, RD, 24);
+	SI7(RE, RC, RA, RB, RD);	KL(RC, RB, RE, RD, RA, 23);
+	SI6(RC, RB, RE, RD, RA);	KL(RE, RA, RD, RC, RB, 22);
+	SI5(RE, RA, RD, RC, RB);	KL(RA, RB, RE, RD, RC, 21);
+	SI4(RA, RB, RE, RD, RC);	KL(RA, RE, RC, RD, RB, 20);
+	SI3(RA, RE, RC, RD, RB);	KL(RC, RA, RB, RD, RE, 19);
+	SI2(RC, RA, RB, RD, RE);	KL(RA, RE, RD, RB, RC, 18);
+	SI1(RA, RE, RD, RB, RC);	KL(RC, RE, RD, RB, RA, 17);
+	SI0(RC, RE, RD, RB, RA);	KL(RD, RA, RE, RC, RB, 16);
+	SI7(RD, RA, RE, RC, RB);	KL(RA, RC, RD, RB, RE, 15);
+	SI6(RA, RC, RD, RB, RE);	KL(RD, RE, RB, RA, RC, 14);
+	SI5(RD, RE, RB, RA, RC);	KL(RE, RC, RD, RB, RA, 13);
+	SI4(RE, RC, RD, RB, RA);	KL(RE, RD, RA, RB, RC, 12);
+	SI3(RE, RD, RA, RB, RC);	KL(RA, RE, RC, RB, RD, 11);
+	SI2(RA, RE, RC, RB, RD);	KL(RE, RD, RB, RC, RA, 10);
+	SI1(RE, RD, RB, RC, RA);	KL(RA, RD, RB, RC, RE, 9);
+	SI0(RA, RD, RB, RC, RE);	KL(RB, RE, RD, RA, RC, 8);
+	SI7(RB, RE, RD, RA, RC);	KL(RE, RA, RB, RC, RD, 7);
+	SI6(RE, RA, RB, RC, RD);	KL(RB, RD, RC, RE, RA, 6);
+	SI5(RB, RD, RC, RE, RA);	KL(RD, RA, RB, RC, RE, 5);
+	SI4(RD, RA, RB, RC, RE);	KL(RD, RB, RE, RC, RA, 4);
+	SI3(RD, RB, RE, RC, RA);	KL(RE, RD, RA, RC, RB, 3);
+	SI2(RE, RD, RA, RC, RB);	KL(RD, RB, RC, RA, RE, 2);
+	SI1(RD, RB, RC, RA, RE);	KL(RE, RB, RC, RA, RD, 1);
+	SI0(RE, RB, RC, RA, RD);	 K(RC, RD, RB, RE, RA, 0);
+
+	movl arg_dst(%esp), %eax;
+	write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
+
+	ret;
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h
index b7fd3b595b27..d3ef63fe0c81 100644
--- a/arch/x86/include/asm/serpent.h
+++ b/arch/x86/include/asm/serpent.h
@@ -4,6 +4,35 @@
 #include <linux/crypto.h>
 #include <crypto/serpent.h>
 
+#ifdef CONFIG_X86_32
+
+#define SERPENT_PARALLEL_BLOCKS 4
+
+asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst,
+				       const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst,
+				     const u8 *src);
+
+static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__serpent_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
+					    const u8 *src)
+{
+	__serpent_enc_blk_4way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	serpent_dec_blk_4way(ctx, dst, src);
+}
+
+#else
+
 #define SERPENT_PARALLEL_BLOCKS 8
 
 asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
@@ -30,3 +59,5 @@ static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
 }
 
 #endif
+
+#endif
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 2df61e458f06..16e0c1304c3a 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -783,6 +783,23 @@ config CRYPTO_SERPENT_SSE2_X86_64
 	  See also:
 	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
 
+config CRYPTO_SERPENT_SSE2_586
+	tristate "Serpent cipher algorithm (i586/SSE2)"
+	depends on X86 && !64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_SERPENT
+	help
+	  Serpent cipher algorithm, by Anderson, Biham & Knudsen.
+
+	  Keys are allowed to be from 0 to 256 bits in length, in steps
+	  of 8 bits.
+
+	  This module provides Serpent cipher algorithm that processes four
+	  blocks parallel using SSE2 instruction set.
+
+	  See also:
+	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
+
 config CRYPTO_TEA
 	tristate "TEA, XTEA and XETA cipher algorithms"
 	select CRYPTO_ALGAPI
-- 
cgit v1.2.3-55-g7522


From 18482053f92b099663bd36a10e8f6bd2c8544669 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Wed, 9 Nov 2011 16:26:36 +0200
Subject: crypto: serpent-sse2 - add lrw support

Patch adds LRW support for serpent-sse2 by using lrw_crypt(). Patch has been
tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios):

Benchmark results with tcrypt:

Intel Celeron T1600 (x86_64) (fam:6, model:15, step:13):
size    lrw-enc lrw-dec
16B     1.00x   0.96x
64B     1.01x   1.01x
256B    3.01x   2.97x
1024B   3.39x   3.33x
8192B   3.35x   3.33x

AMD Phenom II 1055T (x86_64) (fam:16, model:10):
size    lrw-enc lrw-dec
16B     0.98x   1.03x
64B     1.01x   1.04x
256B    2.10x   2.14x
1024B   2.28x   2.33x
8192B   2.30x   2.33x

Intel Atom N270 (i586):
size    lrw-enc lrw-dec
16B     0.97x   0.97x
64B     1.47x   1.50x
256B    1.72x   1.69x
1024B   1.88x   1.81x
8192B   1.84x   1.79x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_sse2_glue.c | 211 ++++++++++++++++++++++++++++++++++++
 crypto/serpent.c                    |  10 +-
 include/crypto/serpent.h            |   2 +
 3 files changed, 221 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 947cf570f6a7..db318e5cb240 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -39,12 +39,17 @@
 #include <crypto/cryptd.h>
 #include <crypto/b128ops.h>
 #include <crypto/ctr.h>
+#include <crypto/lrw.h>
 #include <asm/i387.h>
 #include <asm/serpent.h>
 #include <crypto/scatterwalk.h>
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
 
+#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
+#define HAS_LRW
+#endif
+
 struct async_serpent_ctx {
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
@@ -460,6 +465,152 @@ static struct crypto_alg blk_ctr_alg = {
 	},
 };
 
+#ifdef HAS_LRW
+
+struct crypt_priv {
+	struct serpent_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__serpent_encrypt(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
+}
+
+struct serpent_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct serpent_ctx serpent_ctx;
+};
+
+static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+	int err;
+
+	err = __serpent_setkey(&ctx->serpent_ctx, key, keylen -
+							SERPENT_BLOCK_SIZE);
+	if (err)
+		return err;
+
+	return lrw_init_table(&ctx->lrw_table, key + keylen -
+						SERPENT_BLOCK_SIZE);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static void lrw_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	lrw_free_table(&ctx->lrw_table);
+}
+
+static struct crypto_alg blk_lrw_alg = {
+	.cra_name		= "__lrw-serpent-sse2",
+	.cra_driver_name	= "__driver-lrw-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_lrw_alg.cra_list),
+	.cra_exit		= lrw_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= lrw_serpent_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+};
+
+#endif
+
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
 {
@@ -658,6 +809,48 @@ static struct crypto_alg ablk_ctr_alg = {
 	},
 };
 
+#ifdef HAS_LRW
+
+static int ablk_lrw_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-lrw-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_lrw_alg = {
+	.cra_name		= "lrw(serpent)",
+	.cra_driver_name	= "lrw-serpent-sse2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
+	.cra_init		= ablk_lrw_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+
+#endif
+
 static int __init serpent_sse2_init(void)
 {
 	int err;
@@ -685,8 +878,22 @@ static int __init serpent_sse2_init(void)
 	err = crypto_register_alg(&ablk_ctr_alg);
 	if (err)
 		goto ablk_ctr_err;
+#ifdef HAS_LRW
+	err = crypto_register_alg(&blk_lrw_alg);
+	if (err)
+		goto blk_lrw_err;
+	err = crypto_register_alg(&ablk_lrw_alg);
+	if (err)
+		goto ablk_lrw_err;
+#endif
 	return err;
 
+#ifdef HAS_LRW
+ablk_lrw_err:
+	crypto_unregister_alg(&blk_lrw_alg);
+blk_lrw_err:
+	crypto_unregister_alg(&ablk_ctr_alg);
+#endif
 ablk_ctr_err:
 	crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
@@ -703,6 +910,10 @@ blk_ecb_err:
 
 static void __exit serpent_sse2_exit(void)
 {
+#ifdef HAS_LRW
+	crypto_unregister_alg(&ablk_lrw_alg);
+	crypto_unregister_alg(&blk_lrw_alg);
+#endif
 	crypto_unregister_alg(&ablk_ctr_alg);
 	crypto_unregister_alg(&ablk_cbc_alg);
 	crypto_unregister_alg(&ablk_ecb_alg);
diff --git a/crypto/serpent.c b/crypto/serpent.c
index eb6163041b08..7cc2caeb88a5 100644
--- a/crypto/serpent.c
+++ b/crypto/serpent.c
@@ -206,9 +206,9 @@
 	x1 ^= x4;	x3 ^= x4;	x4 &= x0;	\
 	x4 ^= x2;
 
-int serpent_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen)
+int __serpent_setkey(struct serpent_ctx *ctx, const u8 *key,
+		     unsigned int keylen)
 {
-	struct serpent_ctx *ctx = crypto_tfm_ctx(tfm);
 	u32 *k = ctx->expkey;
 	u8  *k8 = (u8 *)k;
 	u32 r0,r1,r2,r3,r4;
@@ -349,6 +349,12 @@ int serpent_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(__serpent_setkey);
+
+int serpent_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen)
+{
+	return __serpent_setkey(crypto_tfm_ctx(tfm), key, keylen);
+}
 EXPORT_SYMBOL_GPL(serpent_setkey);
 
 void __serpent_encrypt(struct serpent_ctx *ctx, u8 *dst, const u8 *src)
diff --git a/include/crypto/serpent.h b/include/crypto/serpent.h
index 40df885f9d1f..b7e0941eb6fc 100644
--- a/include/crypto/serpent.h
+++ b/include/crypto/serpent.h
@@ -17,6 +17,8 @@ struct serpent_ctx {
 	u32 expkey[SERPENT_EXPKEY_WORDS];
 };
 
+int __serpent_setkey(struct serpent_ctx *ctx, const u8 *key,
+		     unsigned int keylen);
 int serpent_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen);
 
 void __serpent_encrypt(struct serpent_ctx *ctx, u8 *dst, const u8 *src);
-- 
cgit v1.2.3-55-g7522


From 5962f8b66dd040ad89d55b58967ea2dec607f4d3 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Wed, 9 Nov 2011 16:26:41 +0200
Subject: crypto: serpent-sse2 - add xts support

Patch adds XTS support for serpent-sse2 by using xts_crypt(). Patch has been
tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios):

Intel Celeron T1600 (x86_64) (fam:6, model:15, step:13):
size    xts-enc xts-dec
16B     0.98x   1.00x
64B     1.00x   1.01x
256B    2.78x   2.75x
1024B   3.30x   3.26x
8192B   3.39x   3.30x

AMD Phenom II 1055T (x86_64) (fam:16, model:10):
size    xts-enc xts-dec
16B     1.05x   1.02x
64B     1.04x   1.03x
256B    2.10x   2.05x
1024B   2.34x   2.35x
8192B   2.34x   2.40x

Intel Atom N270 (i586):
size    xts-enc xts-dec
16B     0.95x   0.96x
64B     1.53x   1.50x
256B    1.72x   1.75x
1024B   1.88x   1.87x
8192B   1.86x   1.83x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_sse2_glue.c | 180 +++++++++++++++++++++++++++++++++++-
 1 file changed, 178 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index db318e5cb240..2dffc5ab883e 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -40,6 +40,7 @@
 #include <crypto/b128ops.h>
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
+#include <crypto/xts.h>
 #include <asm/i387.h>
 #include <asm/serpent.h>
 #include <crypto/scatterwalk.h>
@@ -50,6 +51,10 @@
 #define HAS_LRW
 #endif
 
+#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
+#define HAS_XTS
+#endif
+
 struct async_serpent_ctx {
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
@@ -465,7 +470,7 @@ static struct crypto_alg blk_ctr_alg = {
 	},
 };
 
-#ifdef HAS_LRW
+#if defined(HAS_LRW) || defined(HAS_XTS)
 
 struct crypt_priv {
 	struct serpent_ctx *ctx;
@@ -506,6 +511,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
 }
 
+#endif
+
+#ifdef HAS_LRW
+
 struct serpent_lrw_ctx {
 	struct lrw_table_ctx lrw_table;
 	struct serpent_ctx serpent_ctx;
@@ -611,6 +620,114 @@ static struct crypto_alg blk_lrw_alg = {
 
 #endif
 
+#ifdef HAS_XTS
+
+struct serpent_xts_ctx {
+	struct serpent_ctx tweak_ctx;
+	struct serpent_ctx crypt_ctx;
+};
+
+static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+	int err;
+
+	/* key consists of keys of equal size concatenated, therefore
+	 * the length must be even
+	 */
+	if (keylen % 2) {
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	/* first half of xts-key is for crypt */
+	err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
+	if (err)
+		return err;
+
+	/* second half of xts-key is for tweak */
+	return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static struct crypto_alg blk_xts_alg = {
+	.cra_name		= "__xts-serpent-sse2",
+	.cra_driver_name	= "__driver-xts-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_xts_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= xts_serpent_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+};
+
+#endif
+
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
 {
@@ -851,6 +968,46 @@ static struct crypto_alg ablk_lrw_alg = {
 
 #endif
 
+#ifdef HAS_XTS
+
+static int ablk_xts_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-xts-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_xts_alg = {
+	.cra_name		= "xts(serpent)",
+	.cra_driver_name	= "xts-serpent-sse2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_xts_alg.cra_list),
+	.cra_init		= ablk_xts_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+
+#endif
+
 static int __init serpent_sse2_init(void)
 {
 	int err;
@@ -885,15 +1042,30 @@ static int __init serpent_sse2_init(void)
 	err = crypto_register_alg(&ablk_lrw_alg);
 	if (err)
 		goto ablk_lrw_err;
+#endif
+#ifdef HAS_XTS
+	err = crypto_register_alg(&blk_xts_alg);
+	if (err)
+		goto blk_xts_err;
+	err = crypto_register_alg(&ablk_xts_alg);
+	if (err)
+		goto ablk_xts_err;
 #endif
 	return err;
 
+#ifdef HAS_XTS
+	crypto_unregister_alg(&ablk_xts_alg);
+ablk_xts_err:
+	crypto_unregister_alg(&blk_xts_alg);
+blk_xts_err:
+#endif
 #ifdef HAS_LRW
+	crypto_unregister_alg(&ablk_lrw_alg);
 ablk_lrw_err:
 	crypto_unregister_alg(&blk_lrw_alg);
 blk_lrw_err:
-	crypto_unregister_alg(&ablk_ctr_alg);
 #endif
+	crypto_unregister_alg(&ablk_ctr_alg);
 ablk_ctr_err:
 	crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
@@ -910,6 +1082,10 @@ blk_ecb_err:
 
 static void __exit serpent_sse2_exit(void)
 {
+#ifdef HAS_XTS
+	crypto_unregister_alg(&ablk_xts_alg);
+	crypto_unregister_alg(&blk_xts_alg);
+#endif
 #ifdef HAS_LRW
 	crypto_unregister_alg(&ablk_lrw_alg);
 	crypto_unregister_alg(&blk_lrw_alg);
-- 
cgit v1.2.3-55-g7522


From d35643385628d44a5933a0755b01478eb4df5c65 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Wed, 9 Nov 2011 19:44:12 +0200
Subject: crypto: serpent-sse2 - clear CRYPTO_TFM_REQ_MAY_SLEEP in lrw and xts
 modes

LRW/XTS patches for serpent-sse2 forgot to add this. CRYPTO_TFM_REQ_MAY_SLEEP
should be cleared as sleeping between kernel_fpu_begin()/kernel_fpu_end() is
not allowed.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_sse2_glue.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 2dffc5ab883e..2f5c304653f4 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -554,6 +554,7 @@ static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	};
 	int ret;
 
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 	ret = lrw_crypt(desc, dst, src, nbytes, &req);
 	serpent_fpu_end(crypt_ctx.fpu_enabled);
 
@@ -579,6 +580,7 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	};
 	int ret;
 
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 	ret = lrw_crypt(desc, dst, src, nbytes, &req);
 	serpent_fpu_end(crypt_ctx.fpu_enabled);
 
@@ -671,6 +673,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	};
 	int ret;
 
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 	ret = xts_crypt(desc, dst, src, nbytes, &req);
 	serpent_fpu_end(crypt_ctx.fpu_enabled);
 
@@ -697,6 +700,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	};
 	int ret;
 
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 	ret = xts_crypt(desc, dst, src, nbytes, &req);
 	serpent_fpu_end(crypt_ctx.fpu_enabled);
 
-- 
cgit v1.2.3-55-g7522


From d88e4cb67197d007fb778d62fe17360e970d5bfa Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 21 Nov 2011 12:32:25 -0800
Subject: freezer: remove now unused TIF_FREEZE

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linux-arch@vger.kernel.org
---
 arch/alpha/include/asm/thread_info.h      | 2 --
 arch/arm/include/asm/thread_info.h        | 2 --
 arch/avr32/include/asm/thread_info.h      | 2 --
 arch/blackfin/include/asm/thread_info.h   | 2 --
 arch/cris/include/asm/thread_info.h       | 2 --
 arch/frv/include/asm/thread_info.h        | 2 --
 arch/h8300/include/asm/thread_info.h      | 2 --
 arch/ia64/include/asm/thread_info.h       | 2 --
 arch/m32r/include/asm/thread_info.h       | 2 --
 arch/m68k/include/asm/thread_info.h       | 1 -
 arch/microblaze/include/asm/thread_info.h | 2 --
 arch/mips/include/asm/thread_info.h       | 2 --
 arch/mn10300/include/asm/thread_info.h    | 2 --
 arch/parisc/include/asm/thread_info.h     | 2 --
 arch/powerpc/include/asm/thread_info.h    | 2 --
 arch/s390/include/asm/thread_info.h       | 2 --
 arch/sh/include/asm/thread_info.h         | 2 --
 arch/sparc/include/asm/thread_info_32.h   | 2 --
 arch/sparc/include/asm/thread_info_64.h   | 2 --
 arch/um/include/asm/thread_info.h         | 2 --
 arch/unicore32/include/asm/thread_info.h  | 2 --
 arch/x86/include/asm/thread_info.h        | 2 --
 arch/xtensa/include/asm/thread_info.h     | 2 --
 23 files changed, 45 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h
index ff73db022342..28335bd40e40 100644
--- a/arch/alpha/include/asm/thread_info.h
+++ b/arch/alpha/include/asm/thread_info.h
@@ -79,7 +79,6 @@ register struct thread_info *__current_thread_info __asm__("$8");
 #define TIF_UAC_SIGBUS		12	/* ! userspace part of 'osf_sysinfo' */
 #define TIF_MEMDIE		13	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	14	/* restore signal mask in do_signal */
-#define TIF_FREEZE		16	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
@@ -87,7 +86,6 @@ register struct thread_info *__current_thread_info __asm__("$8");
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
-#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 /* Work to do on interrupt/exception return.  */
 #define _TIF_WORK_MASK		(_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 7b5cc8dae06e..0f30c3a78fc1 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -142,7 +142,6 @@ extern void vfp_flush_hwstate(struct thread_info *);
 #define TIF_POLLING_NRFLAG	16
 #define TIF_USING_IWMMXT	17
 #define TIF_MEMDIE		18	/* is terminating due to OOM killer */
-#define TIF_FREEZE		19
 #define TIF_RESTORE_SIGMASK	20
 #define TIF_SECCOMP		21
 
@@ -152,7 +151,6 @@ extern void vfp_flush_hwstate(struct thread_info *);
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_USING_IWMMXT	(1 << TIF_USING_IWMMXT)
-#define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 
diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h
index 7a9c03dcb0b6..e5deda4691db 100644
--- a/arch/avr32/include/asm/thread_info.h
+++ b/arch/avr32/include/asm/thread_info.h
@@ -85,7 +85,6 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_RESTORE_SIGMASK	7	/* restore signal mask in do_signal */
 #define TIF_CPU_GOING_TO_SLEEP	8	/* CPU is entering sleep 0 mode */
 #define TIF_NOTIFY_RESUME	9	/* callback before returning to user */
-#define TIF_FREEZE		29
 #define TIF_DEBUG		30	/* debugging enabled */
 #define TIF_USERSPACE		31      /* true if FS sets userspace */
 
@@ -98,7 +97,6 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_CPU_GOING_TO_SLEEP (1 << TIF_CPU_GOING_TO_SLEEP)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
-#define _TIF_FREEZE		(1 << TIF_FREEZE)
 
 /* Note: The masks below must never span more than 16 bits! */
 
diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h
index 02560fd8a121..53ad10005ae3 100644
--- a/arch/blackfin/include/asm/thread_info.h
+++ b/arch/blackfin/include/asm/thread_info.h
@@ -100,7 +100,6 @@ static inline struct thread_info *current_thread_info(void)
 					   TIF_NEED_RESCHED */
 #define TIF_MEMDIE		4	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
-#define TIF_FREEZE		6	/* is freezing for suspend */
 #define TIF_IRQ_SYNC		7	/* sync pipeline stage */
 #define TIF_NOTIFY_RESUME	8	/* callback before returning to user */
 #define TIF_SINGLESTEP		9
@@ -111,7 +110,6 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
-#define _TIF_FREEZE		(1<<TIF_FREEZE)
 #define _TIF_IRQ_SYNC		(1<<TIF_IRQ_SYNC)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h
index 332f19c54557..29b92884d793 100644
--- a/arch/cris/include/asm/thread_info.h
+++ b/arch/cris/include/asm/thread_info.h
@@ -86,7 +86,6 @@ struct thread_info {
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		17	/* is terminating due to OOM killer */
-#define TIF_FREEZE		18	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
@@ -94,7 +93,6 @@ struct thread_info {
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
-#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
 #define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h
index cefbe73dc119..92d83ea99ae5 100644
--- a/arch/frv/include/asm/thread_info.h
+++ b/arch/frv/include/asm/thread_info.h
@@ -111,7 +111,6 @@ register struct thread_info *__current_thread_info asm("gr15");
 #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		17	/* is terminating due to OOM killer */
-#define TIF_FREEZE		18	/* freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -120,7 +119,6 @@ register struct thread_info *__current_thread_info asm("gr15");
 #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
 #define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
-#define _TIF_FREEZE		(1 << TIF_FREEZE)
 
 #define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
 #define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h
index d6f1784bfdee..9c126e0c09aa 100644
--- a/arch/h8300/include/asm/thread_info.h
+++ b/arch/h8300/include/asm/thread_info.h
@@ -90,7 +90,6 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_MEMDIE		4	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
 #define TIF_NOTIFY_RESUME	6	/* callback before returning to user */
-#define TIF_FREEZE		16	/* is freezing for suspend */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -99,7 +98,6 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
-#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
 
diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index ff0cc84e7bcc..e054bcc4273c 100644
--- a/arch/ia64/include/asm/thread_info.h
+++ b/arch/ia64/include/asm/thread_info.h
@@ -113,7 +113,6 @@ struct thread_info {
 #define TIF_MEMDIE		17	/* is terminating due to OOM killer */
 #define TIF_MCA_INIT		18	/* this task is processing MCA or INIT */
 #define TIF_DB_DISABLED		19	/* debug trap disabled for fsyscall */
-#define TIF_FREEZE		20	/* is freezing for suspend */
 #define TIF_RESTORE_RSE		21	/* user RBS is newer than kernel RBS */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
@@ -126,7 +125,6 @@ struct thread_info {
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_MCA_INIT		(1 << TIF_MCA_INIT)
 #define _TIF_DB_DISABLED	(1 << TIF_DB_DISABLED)
-#define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_RESTORE_RSE	(1 << TIF_RESTORE_RSE)
 
 /* "work to do on user-return" bits */
diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h
index 0227dba44068..bf8fa3c06f4e 100644
--- a/arch/m32r/include/asm/thread_info.h
+++ b/arch/m32r/include/asm/thread_info.h
@@ -138,7 +138,6 @@ static inline unsigned int get_thread_fault_code(void)
 #define TIF_USEDFPU		16	/* FPU was used by this task this quantum (SMP) */
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		18	/* is terminating due to OOM killer */
-#define TIF_FREEZE		19	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
@@ -149,7 +148,6 @@ static inline unsigned int get_thread_fault_code(void)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_USEDFPU		(1<<TIF_USEDFPU)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
-#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
 #define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h
index 790988967ba7..294df1592de5 100644
--- a/arch/m68k/include/asm/thread_info.h
+++ b/arch/m68k/include/asm/thread_info.h
@@ -103,7 +103,6 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_DELAYED_TRACE	14	/* single step a syscall */
 #define TIF_SYSCALL_TRACE	15	/* syscall trace active */
 #define TIF_MEMDIE		16	/* is terminating due to OOM killer */
-#define TIF_FREEZE		17	/* thread is freezing for suspend */
 #define TIF_RESTORE_SIGMASK	18	/* restore signal mask in do_signal */
 
 #endif	/* _ASM_M68K_THREAD_INFO_H */
diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h
index b73da2ac21b3..1a8ab6a5c03f 100644
--- a/arch/microblaze/include/asm/thread_info.h
+++ b/arch/microblaze/include/asm/thread_info.h
@@ -125,7 +125,6 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_MEMDIE		6	/* is terminating due to OOM killer */
 #define TIF_SYSCALL_AUDIT	9       /* syscall auditing active */
 #define TIF_SECCOMP		10      /* secure computing */
-#define TIF_FREEZE		14	/* Freezing for suspend */
 
 /* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_POLLING_NRFLAG	16
@@ -137,7 +136,6 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
 #define _TIF_IRET		(1 << TIF_IRET)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
-#define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 
diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h
index 97f8bf6639e7..0d85d8e440c5 100644
--- a/arch/mips/include/asm/thread_info.h
+++ b/arch/mips/include/asm/thread_info.h
@@ -117,7 +117,6 @@ register struct thread_info *__current_thread_info __asm__("$28");
 #define TIF_USEDFPU		16	/* FPU was used by this task this quantum (SMP) */
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		18	/* is terminating due to OOM killer */
-#define TIF_FREEZE		19
 #define TIF_FIXADE		20	/* Fix address errors in software */
 #define TIF_LOGADE		21	/* Log address errors to syslog */
 #define TIF_32BIT_REGS		22	/* also implies 16/32 fprs */
@@ -141,7 +140,6 @@ register struct thread_info *__current_thread_info __asm__("$28");
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_USEDFPU		(1<<TIF_USEDFPU)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
-#define _TIF_FREEZE		(1<<TIF_FREEZE)
 #define _TIF_FIXADE		(1<<TIF_FIXADE)
 #define _TIF_LOGADE		(1<<TIF_LOGADE)
 #define _TIF_32BIT_REGS		(1<<TIF_32BIT_REGS)
diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h
index 87c213002d4c..28cf52100baa 100644
--- a/arch/mn10300/include/asm/thread_info.h
+++ b/arch/mn10300/include/asm/thread_info.h
@@ -165,7 +165,6 @@ extern void free_thread_info(struct thread_info *);
 #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		17	/* is terminating due to OOM killer */
-#define TIF_FREEZE		18	/* freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	+(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	+(1 << TIF_NOTIFY_RESUME)
@@ -174,7 +173,6 @@ extern void free_thread_info(struct thread_info *);
 #define _TIF_SINGLESTEP		+(1 << TIF_SINGLESTEP)
 #define _TIF_RESTORE_SIGMASK	+(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_POLLING_NRFLAG	+(1 << TIF_POLLING_NRFLAG)
-#define _TIF_FREEZE		+(1 << TIF_FREEZE)
 
 #define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
 #define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h
index aa8de727e90b..6d9c7c7973d0 100644
--- a/arch/parisc/include/asm/thread_info.h
+++ b/arch/parisc/include/asm/thread_info.h
@@ -58,7 +58,6 @@ struct thread_info {
 #define TIF_32BIT               4       /* 32 bit binary */
 #define TIF_MEMDIE		5	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	6	/* restore saved signal mask */
-#define TIF_FREEZE		7	/* is freezing for suspend */
 #define TIF_NOTIFY_RESUME	8	/* callback before returning to user */
 #define TIF_SINGLESTEP		9	/* single stepping? */
 #define TIF_BLOCKSTEP		10	/* branch stepping? */
@@ -69,7 +68,6 @@ struct thread_info {
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_32BIT		(1 << TIF_32BIT)
 #define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
-#define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 836f231ec1f0..964714940961 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -109,7 +109,6 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_RESTOREALL		11	/* Restore all regs (implies NOERROR) */
 #define TIF_NOERROR		12	/* Force successful syscall return */
 #define TIF_NOTIFY_RESUME	13	/* callback before returning to user */
-#define TIF_FREEZE		14	/* Freezing for suspend */
 #define TIF_SYSCALL_TRACEPOINT	15	/* syscall tracepoint instrumentation */
 #define TIF_RUNLATCH		16	/* Is the runlatch enabled? */
 
@@ -127,7 +126,6 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_RESTOREALL		(1<<TIF_RESTOREALL)
 #define _TIF_NOERROR		(1<<TIF_NOERROR)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
-#define _TIF_FREEZE		(1<<TIF_FREEZE)
 #define _TIF_SYSCALL_TRACEPOINT	(1<<TIF_SYSCALL_TRACEPOINT)
 #define _TIF_RUNLATCH		(1<<TIF_RUNLATCH)
 #define _TIF_SYSCALL_T_OR_A	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index a23183423b14..a73038155e0d 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -102,7 +102,6 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_MEMDIE		18	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	19	/* restore signal mask in do_signal() */
 #define TIF_SINGLE_STEP		20	/* This task is single stepped */
-#define TIF_FREEZE		21	/* thread is freezing for suspend */
 
 #define _TIF_SYSCALL		(1<<TIF_SYSCALL)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
@@ -119,7 +118,6 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_31BIT		(1<<TIF_31BIT)
 #define _TIF_SINGLE_STEP	(1<<TIF_SINGLE_STEP)
-#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #ifdef CONFIG_64BIT
 #define is_32bit_task()		(test_thread_flag(TIF_31BIT))
diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h
index ea2d5089de1e..20ee40af16e9 100644
--- a/arch/sh/include/asm/thread_info.h
+++ b/arch/sh/include/asm/thread_info.h
@@ -122,7 +122,6 @@ extern void init_thread_xstate(void);
 #define TIF_SYSCALL_TRACEPOINT	8	/* for ftrace syscall instrumentation */
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		18	/* is terminating due to OOM killer */
-#define TIF_FREEZE		19	/* Freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
@@ -133,7 +132,6 @@ extern void init_thread_xstate(void);
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
-#define _TIF_FREEZE		(1 << TIF_FREEZE)
 
 /*
  * _TIF_ALLWORK_MASK and _TIF_WORK_MASK need to fit within 2 bytes, or we
diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h
index fa5753233410..5cc5888ad5a3 100644
--- a/arch/sparc/include/asm/thread_info_32.h
+++ b/arch/sparc/include/asm/thread_info_32.h
@@ -133,7 +133,6 @@ BTFIXUPDEF_CALL(void, free_thread_info, struct thread_info *)
 #define TIF_POLLING_NRFLAG	9	/* true if poll_idle() is polling
 					 * TIF_NEED_RESCHED */
 #define TIF_MEMDIE		10	/* is terminating due to OOM killer */
-#define TIF_FREEZE		11	/* is freezing for suspend */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -147,7 +146,6 @@ BTFIXUPDEF_CALL(void, free_thread_info, struct thread_info *)
 #define _TIF_DO_NOTIFY_RESUME_MASK	(_TIF_NOTIFY_RESUME | \
 					 _TIF_SIGPENDING | \
 					 _TIF_RESTORE_SIGMASK)
-#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index 60d86be1a533..01d057fe6a3f 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -225,7 +225,6 @@ register struct thread_info *current_thread_info_reg asm("g6");
 /* flag bit 12 is available */
 #define TIF_MEMDIE		13	/* is terminating due to OOM killer */
 #define TIF_POLLING_NRFLAG	14
-#define TIF_FREEZE		15	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
@@ -237,7 +236,6 @@ register struct thread_info *current_thread_info_reg asm("g6");
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_TRACEPOINT	(1<<TIF_SYSCALL_TRACEPOINT)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
-#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #define _TIF_USER_WORK_MASK	((0xff << TI_FLAG_WSAVED_SHIFT) | \
 				 _TIF_DO_NOTIFY_RESUME_MASK | \
diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h
index 5bd1bad33fab..200c4ab1240c 100644
--- a/arch/um/include/asm/thread_info.h
+++ b/arch/um/include/asm/thread_info.h
@@ -71,7 +71,6 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_MEMDIE		5	/* is terminating due to OOM killer */
 #define TIF_SYSCALL_AUDIT	6
 #define TIF_RESTORE_SIGMASK	7
-#define TIF_FREEZE		16	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
@@ -80,6 +79,5 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_MEMDIE		(1 << TIF_MEMDIE)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
-#define _TIF_FREEZE		(1 << TIF_FREEZE)
 
 #endif
diff --git a/arch/unicore32/include/asm/thread_info.h b/arch/unicore32/include/asm/thread_info.h
index c270e9e04861..89f7557583b8 100644
--- a/arch/unicore32/include/asm/thread_info.h
+++ b/arch/unicore32/include/asm/thread_info.h
@@ -135,14 +135,12 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
 #define TIF_SYSCALL_TRACE	8
 #define TIF_MEMDIE		18
-#define TIF_FREEZE		19
 #define TIF_RESTORE_SIGMASK	20
 
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
-#define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 
 /*
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c127b52..32125af20d32 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -90,7 +90,6 @@ struct thread_info {
 #define TIF_MEMDIE		20	/* is terminating due to OOM killer */
 #define TIF_DEBUG		21	/* uses debug registers */
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
-#define TIF_FREEZE		23	/* is freezing for suspend */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
@@ -112,7 +111,6 @@ struct thread_info {
 #define _TIF_FORK		(1 << TIF_FORK)
 #define _TIF_DEBUG		(1 << TIF_DEBUG)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
-#define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h
index 7be8accb0b0c..6abbedd09d85 100644
--- a/arch/xtensa/include/asm/thread_info.h
+++ b/arch/xtensa/include/asm/thread_info.h
@@ -132,7 +132,6 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_MEMDIE		5	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	6	/* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
-#define TIF_FREEZE		17	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
@@ -141,7 +140,6 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_IRET		(1<<TIF_IRET)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
-#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
 #define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
-- 
cgit v1.2.3-55-g7522


From 80df46494846e857399618c54df30ce294dc1edd Mon Sep 17 00:00:00 2001
From: Maxim Uvarov
Date: Fri, 14 Oct 2011 15:36:51 -0700
Subject: xen: Make XEN_MAX_DOMAIN_MEMORY have more sensible defaults

Which is that 128GB is not going to happen with 32-bit PV DomU.
Lets use something more realistic. Also update the 64-bit to 500GB
which is the max a PV guest can do.

Signed-off-by: Maxim Uvarov <maxim.uvarov@oracle.com>
[v1: Updated 128GB->500GB for 64-bit]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/Kconfig | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 26c731a106af..fdce49c7aff6 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -29,7 +29,8 @@ config XEN_PVHVM
 
 config XEN_MAX_DOMAIN_MEMORY
        int
-       default 128
+       default 500 if X86_64
+       default 64 if X86_32
        depends on XEN
        help
          This only affects the sizing of some bss arrays, the unused
@@ -48,3 +49,4 @@ config XEN_DEBUG_FS
 	help
 	  Enable statistics output and various tuning options in debugfs.
 	  Enabling this option may incur a significant performance overhead.
+
-- 
cgit v1.2.3-55-g7522


From 0f9f5a9588468cddeccc9146b86798492c7cd4f5 Mon Sep 17 00:00:00 2001
From: Annie Li
Date: Tue, 22 Nov 2011 09:58:06 +0800
Subject: xen/granttable: Introducing grant table V2 stucture

This patch introduces new structures of grant table V2, grant table V2 is an
extension from V1. Grant table is shared between guest and Xen, and Xen is
responsible to do corresponding work for grant operations, such as: figure
out guest's grant table version, perform different actions based on
different grant table version, etc. Although full-page structure of V2
is different from V1, it play the same role as V1.

Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Annie Li <annie.li@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/grant-table.c          |   7 +-
 drivers/xen/grant-table.c           | 181 ++++++++++++++++++++++++++++--------
 include/xen/grant_table.h           |   4 +-
 include/xen/interface/grant_table.h | 167 ++++++++++++++++++++++++++++++++-
 include/xen/interface/xen.h         |   2 +
 5 files changed, 310 insertions(+), 51 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 6bbfd7ac5e81..c6ab2e7ca3a6 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -64,10 +64,10 @@ static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
 
 int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 			   unsigned long max_nr_gframes,
-			   struct grant_entry **__shared)
+			   void **__shared)
 {
 	int rc;
-	struct grant_entry *shared = *__shared;
+	void *shared = *__shared;
 
 	if (shared == NULL) {
 		struct vm_struct *area =
@@ -83,8 +83,7 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 	return rc;
 }
 
-void arch_gnttab_unmap_shared(struct grant_entry *shared,
-			      unsigned long nr_gframes)
+void arch_gnttab_unmap_shared(void *shared, unsigned long nr_gframes)
 {
 	apply_to_page_range(&init_mm, (unsigned long)shared,
 			    PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index bf1c094f4ebf..18355a53763f 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -53,7 +53,7 @@
 /* External tools reserve first few grant table entries. */
 #define NR_RESERVED_ENTRIES 8
 #define GNTTAB_LIST_END 0xffffffff
-#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry))
+#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry_v1))
 
 static grant_ref_t **gnttab_list;
 static unsigned int nr_grant_frames;
@@ -64,7 +64,63 @@ static DEFINE_SPINLOCK(gnttab_list_lock);
 unsigned long xen_hvm_resume_frames;
 EXPORT_SYMBOL_GPL(xen_hvm_resume_frames);
 
-static struct grant_entry *shared;
+static union {
+	struct grant_entry_v1 *v1;
+	void *addr;
+} gnttab_shared;
+
+/*This is a structure of function pointers for grant table*/
+struct gnttab_ops {
+	/*
+	 * Mapping a list of frames for storing grant entries. First input
+	 * parameter is used to storing grant table address when grant table
+	 * being setup, second parameter is the number of frames to map grant
+	 * table. Returning GNTST_okay means success and negative value means
+	 * failure.
+	 */
+	int (*map_frames)(unsigned long *, unsigned int);
+	/*
+	 * Release a list of frames which are mapped in map_frames for grant
+	 * entry status.
+	 */
+	void (*unmap_frames)(void);
+	/*
+	 * Introducing a valid entry into the grant table, granting the frame
+	 * of this grant entry to domain for accessing, or transfering, or
+	 * transitively accessing. First input parameter is reference of this
+	 * introduced grant entry, second one is domid of granted domain, third
+	 * one is the frame to be granted, and the last one is status of the
+	 * grant entry to be updated.
+	 */
+	void (*update_entry)(grant_ref_t, domid_t, unsigned long, unsigned);
+	/*
+	 * Stop granting a grant entry to domain for accessing. First input
+	 * parameter is reference of a grant entry whose grant access will be
+	 * stopped, second one is not in use now. If the grant entry is
+	 * currently mapped for reading or writing, just return failure(==0)
+	 * directly and don't tear down the grant access. Otherwise, stop grant
+	 * access for this entry and return success(==1).
+	 */
+	int (*end_foreign_access_ref)(grant_ref_t, int);
+	/*
+	 * Stop granting a grant entry to domain for transfer. If tranfer has
+	 * not started, just reclaim the grant entry and return failure(==0).
+	 * Otherwise, wait for the transfer to complete and then return the
+	 * frame.
+	 */
+	unsigned long (*end_foreign_transfer_ref)(grant_ref_t);
+	/*
+	 * Query the status of a grant entry. Input parameter is reference of
+	 * queried grant entry, return value is the status of queried entry.
+	 * Detailed status(writing/reading) can be gotten from the return value
+	 * by bit operations.
+	 */
+	int (*query_foreign_access)(grant_ref_t);
+};
+
+static struct gnttab_ops *gnttab_interface;
+
+static int grant_table_version;
 
 static struct gnttab_free_callback *gnttab_free_callback_list;
 
@@ -142,23 +198,23 @@ static void put_free_entry(grant_ref_t ref)
 	spin_unlock_irqrestore(&gnttab_list_lock, flags);
 }
 
-static void update_grant_entry(grant_ref_t ref, domid_t domid,
-			       unsigned long frame, unsigned flags)
+/*
+ * Introducing a valid entry into the grant table:
+ *  1. Write ent->domid.
+ *  2. Write ent->frame:
+ *      GTF_permit_access:   Frame to which access is permitted.
+ *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
+ *                           frame, or zero if none.
+ *  3. Write memory barrier (WMB).
+ *  4. Write ent->flags, inc. valid type.
+ */
+static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid,
+				   unsigned long frame, unsigned flags)
 {
-	/*
-	 * Introducing a valid entry into the grant table:
-	 *  1. Write ent->domid.
-	 *  2. Write ent->frame:
-	 *      GTF_permit_access:   Frame to which access is permitted.
-	 *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
-	 *                           frame, or zero if none.
-	 *  3. Write memory barrier (WMB).
-	 *  4. Write ent->flags, inc. valid type.
-	 */
-	shared[ref].frame = frame;
-	shared[ref].domid = domid;
+	gnttab_shared.v1[ref].domid = domid;
+	gnttab_shared.v1[ref].frame = frame;
 	wmb();
-	shared[ref].flags = flags;
+	gnttab_shared.v1[ref].flags = flags;
 }
 
 /*
@@ -167,7 +223,7 @@ static void update_grant_entry(grant_ref_t ref, domid_t domid,
 void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
 				     unsigned long frame, int readonly)
 {
-	update_grant_entry(ref, domid, frame,
+	gnttab_interface->update_entry(ref, domid, frame,
 			   GTF_permit_access | (readonly ? GTF_readonly : 0));
 }
 EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
@@ -187,31 +243,37 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
 }
 EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
 
-int gnttab_query_foreign_access(grant_ref_t ref)
+static int gnttab_query_foreign_access_v1(grant_ref_t ref)
 {
-	u16 nflags;
-
-	nflags = shared[ref].flags;
+	return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing);
+}
 
-	return nflags & (GTF_reading|GTF_writing);
+int gnttab_query_foreign_access(grant_ref_t ref)
+{
+	return gnttab_interface->query_foreign_access(ref);
 }
 EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
 
-int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly)
 {
 	u16 flags, nflags;
 
-	nflags = shared[ref].flags;
+	nflags = gnttab_shared.v1[ref].flags;
 	do {
 		flags = nflags;
 		if (flags & (GTF_reading|GTF_writing)) {
 			printk(KERN_ALERT "WARNING: g.e. still in use!\n");
 			return 0;
 		}
-	} while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) != flags);
+	} while ((nflags = sync_cmpxchg(&gnttab_shared.v1[ref].flags, flags, 0)) != flags);
 
 	return 1;
 }
+
+int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+{
+	return gnttab_interface->end_foreign_access_ref(ref, readonly);
+}
 EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
 
 void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
@@ -246,11 +308,11 @@ EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
 void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
 				       unsigned long pfn)
 {
-	update_grant_entry(ref, domid, pfn, GTF_accept_transfer);
+	gnttab_interface->update_entry(ref, domid, pfn, GTF_accept_transfer);
 }
 EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
 
-unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
+static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref)
 {
 	unsigned long frame;
 	u16           flags;
@@ -259,24 +321,29 @@ unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
 	 * If a transfer is not even yet started, try to reclaim the grant
 	 * reference and return failure (== 0).
 	 */
-	while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
-		if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags)
+	while (!((flags = gnttab_shared.v1[ref].flags) & GTF_transfer_committed)) {
+		if (sync_cmpxchg(&gnttab_shared.v1[ref].flags, flags, 0) == flags)
 			return 0;
 		cpu_relax();
 	}
 
 	/* If a transfer is in progress then wait until it is completed. */
 	while (!(flags & GTF_transfer_completed)) {
-		flags = shared[ref].flags;
+		flags = gnttab_shared.v1[ref].flags;
 		cpu_relax();
 	}
 
 	rmb();	/* Read the frame number /after/ reading completion status. */
-	frame = shared[ref].frame;
+	frame = gnttab_shared.v1[ref].frame;
 	BUG_ON(frame == 0);
 
 	return frame;
 }
+
+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
+{
+	return gnttab_interface->end_foreign_transfer_ref(ref);
+}
 EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
 
 unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
@@ -520,6 +587,23 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
 }
 EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
 
+static int gnttab_map_frames_v1(unsigned long *frames, unsigned int nr_gframes)
+{
+	int rc;
+
+	rc = arch_gnttab_map_shared(frames, nr_gframes,
+				    gnttab_max_grant_frames(),
+				    &gnttab_shared.addr);
+	BUG_ON(rc);
+
+	return 0;
+}
+
+static void gnttab_unmap_frames_v1(void)
+{
+	arch_gnttab_unmap_shared(gnttab_shared.addr, nr_grant_frames);
+}
+
 static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 {
 	struct gnttab_setup_table setup;
@@ -567,19 +651,35 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 
 	BUG_ON(rc || setup.status);
 
-	rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(),
-				    &shared);
-	BUG_ON(rc);
+	rc = gnttab_interface->map_frames(frames, nr_gframes);
 
 	kfree(frames);
 
-	return 0;
+	return rc;
+}
+
+static struct gnttab_ops gnttab_v1_ops = {
+	.map_frames			= gnttab_map_frames_v1,
+	.unmap_frames			= gnttab_unmap_frames_v1,
+	.update_entry			= gnttab_update_entry_v1,
+	.end_foreign_access_ref		= gnttab_end_foreign_access_ref_v1,
+	.end_foreign_transfer_ref	= gnttab_end_foreign_transfer_ref_v1,
+	.query_foreign_access		= gnttab_query_foreign_access_v1,
+};
+
+static void gnttab_request_version(void)
+{
+	grant_table_version = 1;
+	gnttab_interface = &gnttab_v1_ops;
+	printk(KERN_INFO "Grant tables using version %d layout.\n",
+		grant_table_version);
 }
 
 int gnttab_resume(void)
 {
 	unsigned int max_nr_gframes;
 
+	gnttab_request_version();
 	max_nr_gframes = gnttab_max_grant_frames();
 	if (max_nr_gframes < nr_grant_frames)
 		return -ENOSYS;
@@ -587,9 +687,10 @@ int gnttab_resume(void)
 	if (xen_pv_domain())
 		return gnttab_map(0, nr_grant_frames - 1);
 
-	if (!shared) {
-		shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes);
-		if (shared == NULL) {
+	if (gnttab_shared.addr == NULL) {
+		gnttab_shared.addr = ioremap(xen_hvm_resume_frames,
+						PAGE_SIZE * max_nr_gframes);
+		if (gnttab_shared.addr == NULL) {
 			printk(KERN_WARNING
 					"Failed to ioremap gnttab share frames!");
 			return -ENOMEM;
@@ -603,7 +704,7 @@ int gnttab_resume(void)
 
 int gnttab_suspend(void)
 {
-	arch_gnttab_unmap_shared(shared, nr_grant_frames);
+	gnttab_interface->unmap_frames();
 	return 0;
 }
 
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index 11e2dfce42f8..c7a40f8d455a 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -145,8 +145,8 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
 
 int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 			   unsigned long max_nr_gframes,
-			   struct grant_entry **__shared);
-void arch_gnttab_unmap_shared(struct grant_entry *shared,
+			   void **__shared);
+void arch_gnttab_unmap_shared(void *shared,
 			      unsigned long nr_gframes);
 
 extern unsigned long xen_hvm_resume_frames;
diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h
index 39e571796e32..a17d84433e6a 100644
--- a/include/xen/interface/grant_table.h
+++ b/include/xen/interface/grant_table.h
@@ -84,13 +84,23 @@
  *  Use SMP-safe bit-setting instruction.
  */
 
+/*
+ * Reference to a grant entry in a specified domain's grant table.
+ */
+typedef uint32_t grant_ref_t;
+
 /*
  * A grant table comprises a packed array of grant entries in one or more
  * page frames shared between Xen and a guest.
  * [XEN]: This field is written by Xen and read by the sharing guest.
  * [GST]: This field is written by the guest and read by Xen.
  */
-struct grant_entry {
+
+/*
+ * Version 1 of the grant table entry structure is maintained purely
+ * for backwards compatibility.  New guests should use version 2.
+ */
+struct grant_entry_v1 {
     /* GTF_xxx: various type and flag information.  [XEN,GST] */
     uint16_t flags;
     /* The domain being granted foreign privileges. [GST] */
@@ -108,10 +118,13 @@ struct grant_entry {
  *  GTF_permit_access: Allow @domid to map/access @frame.
  *  GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
  *                       to this guest. Xen writes the page number to @frame.
+ *  GTF_transitive: Allow @domid to transitively access a subrange of
+ *                  @trans_grant in @trans_domid.  No mappings are allowed.
  */
 #define GTF_invalid         (0U<<0)
 #define GTF_permit_access   (1U<<0)
 #define GTF_accept_transfer (2U<<0)
+#define GTF_transitive      (3U<<0)
 #define GTF_type_mask       (3U<<0)
 
 /*
@@ -119,6 +132,9 @@ struct grant_entry {
  *  GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
  *  GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
  *  GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
+ *  GTF_sub_page: Grant access to only a subrange of the page.  @domid
+ *                will only be allowed to copy from the grant, and not
+ *                map it. [GST]
  */
 #define _GTF_readonly       (2)
 #define GTF_readonly        (1U<<_GTF_readonly)
@@ -126,6 +142,8 @@ struct grant_entry {
 #define GTF_reading         (1U<<_GTF_reading)
 #define _GTF_writing        (4)
 #define GTF_writing         (1U<<_GTF_writing)
+#define _GTF_sub_page       (8)
+#define GTF_sub_page        (1U<<_GTF_sub_page)
 
 /*
  * Subflags for GTF_accept_transfer:
@@ -142,15 +160,81 @@ struct grant_entry {
 #define _GTF_transfer_completed (3)
 #define GTF_transfer_completed  (1U<<_GTF_transfer_completed)
 
+/*
+ * Version 2 grant table entries.  These fulfil the same role as
+ * version 1 entries, but can represent more complicated operations.
+ * Any given domain will have either a version 1 or a version 2 table,
+ * and every entry in the table will be the same version.
+ *
+ * The interface by which domains use grant references does not depend
+ * on the grant table version in use by the other domain.
+ */
 
-/***********************************
- * GRANT TABLE QUERIES AND USES
+/*
+ * Version 1 and version 2 grant entries share a common prefix.  The
+ * fields of the prefix are documented as part of struct
+ * grant_entry_v1.
  */
+struct grant_entry_header {
+    uint16_t flags;
+    domid_t  domid;
+};
 
 /*
- * Reference to a grant entry in a specified domain's grant table.
+ * Version 2 of the grant entry structure, here is an union because three
+ * different types are suppotted: full_page, sub_page and transitive.
+ */
+union grant_entry_v2 {
+    struct grant_entry_header hdr;
+
+    /*
+     * This member is used for V1-style full page grants, where either:
+     *
+     * -- hdr.type is GTF_accept_transfer, or
+     * -- hdr.type is GTF_permit_access and GTF_sub_page is not set.
+     *
+     * In that case, the frame field has the same semantics as the
+     * field of the same name in the V1 entry structure.
+     */
+    struct {
+	struct grant_entry_header hdr;
+	uint32_t pad0;
+	uint64_t frame;
+    } full_page;
+
+    /*
+     * If the grant type is GTF_grant_access and GTF_sub_page is set,
+     * @domid is allowed to access bytes [@page_off,@page_off+@length)
+     * in frame @frame.
+     */
+    struct {
+	struct grant_entry_header hdr;
+	uint16_t page_off;
+	uint16_t length;
+	uint64_t frame;
+    } sub_page;
+
+    /*
+     * If the grant is GTF_transitive, @domid is allowed to use the
+     * grant @gref in domain @trans_domid, as if it was the local
+     * domain.  Obviously, the transitive access must be compatible
+     * with the original grant.
+     */
+    struct {
+	struct grant_entry_header hdr;
+	domid_t trans_domid;
+	uint16_t pad0;
+	grant_ref_t gref;
+    } transitive;
+
+    uint32_t __spacer[4]; /* Pad to a power of two */
+};
+
+typedef uint16_t grant_status_t;
+
+/***********************************
+ * GRANT TABLE QUERIES AND USES
  */
-typedef uint32_t grant_ref_t;
 
 /*
  * Handle to track a mapping created via a grant reference.
@@ -321,6 +405,79 @@ struct gnttab_query_size {
 };
 DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size);
 
+/*
+ * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings
+ * tracked by <handle> but atomically replace the page table entry with one
+ * pointing to the machine address under <new_addr>.  <new_addr> will be
+ * redirected to the null entry.
+ * NOTES:
+ *  1. The call may fail in an undefined manner if either mapping is not
+ *     tracked by <handle>.
+ *  2. After executing a batch of unmaps, it is guaranteed that no stale
+ *     mappings will remain in the device or host TLBs.
+ */
+#define GNTTABOP_unmap_and_replace    7
+struct gnttab_unmap_and_replace {
+    /* IN parameters. */
+    uint64_t host_addr;
+    uint64_t new_addr;
+    grant_handle_t handle;
+    /* OUT parameters. */
+    int16_t  status;              /* GNTST_* */
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_and_replace);
+
+/*
+ * GNTTABOP_set_version: Request a particular version of the grant
+ * table shared table structure.  This operation can only be performed
+ * once in any given domain.  It must be performed before any grants
+ * are activated; otherwise, the domain will be stuck with version 1.
+ * The only defined versions are 1 and 2.
+ */
+#define GNTTABOP_set_version          8
+struct gnttab_set_version {
+    /* IN parameters */
+    uint32_t version;
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_set_version);
+
+/*
+ * GNTTABOP_get_status_frames: Get the list of frames used to store grant
+ * status for <dom>. In grant format version 2, the status is separated
+ * from the other shared grant fields to allow more efficient synchronization
+ * using barriers instead of atomic cmpexch operations.
+ * <nr_frames> specify the size of vector <frame_list>.
+ * The frame addresses are returned in the <frame_list>.
+ * Only <nr_frames> addresses are returned, even if the table is larger.
+ * NOTES:
+ *  1. <dom> may be specified as DOMID_SELF.
+ *  2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
+ */
+#define GNTTABOP_get_status_frames     9
+struct gnttab_get_status_frames {
+    /* IN parameters. */
+    uint32_t nr_frames;
+    domid_t  dom;
+    /* OUT parameters. */
+    int16_t  status;              /* GNTST_* */
+    GUEST_HANDLE(uint64_t) frame_list;
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_status_frames);
+
+/*
+ * GNTTABOP_get_version: Get the grant table version which is in
+ * effect for domain <dom>.
+ */
+#define GNTTABOP_get_version          10
+struct gnttab_get_version {
+    /* IN parameters */
+    domid_t dom;
+    uint16_t pad;
+    /* OUT parameters */
+    uint32_t version;
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version);
+
 /*
  * Bitfield values for update_pin_status.flags.
  */
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 6a6e91449347..a890804945e3 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -523,6 +523,8 @@ struct tmem_op {
 	} u;
 };
 
+DEFINE_GUEST_HANDLE(u64);
+
 #else /* __ASSEMBLY__ */
 
 /* In assembly code we cannot use C numeric constant suffixes. */
-- 
cgit v1.2.3-55-g7522


From 85ff6acb075a484780b3d763fdf41596d8fc0970 Mon Sep 17 00:00:00 2001
From: Annie Li
Date: Tue, 22 Nov 2011 09:59:21 +0800
Subject: xen/granttable: Grant tables V2 implementation

Receiver-side copying of packets is based on this implementation, it gives
better performance and better CPU accounting. It totally supports three types:
full-page, sub-page and transitive grants.

However this patch does not cover sub-page and transitive grants, it mainly
focus on Full-page part and implements grant table V2 interfaces corresponding
to what already exists in grant table V1, such as: grant table V2
initialization, mapping, releasing and exported interfaces.

Each guest can only supports one type of grant table type, every entry in grant
table should be the same version. It is necessary to set V1 or V2 version before
initializing the grant table.

Grant table exported interfaces of V2 are same with those of V1, Xen is
responsible to judge what grant table version guests are using in every grant
operation.

V2 fulfills the same role of V1, and it is totally backwards compitable with V1.
If dom0 support grant table V2, the guests runing on it can run with either V1
or V2.

Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Annie Li <annie.li@oracle.com>
[v1: Modified alloc_vm_area call (new parameters), indentation, and cleanpatch
     warnings]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/grant-table.c |  39 ++++++++++-
 drivers/xen/grant-table.c  | 171 +++++++++++++++++++++++++++++++++++++++++++--
 include/xen/grant_table.h  |   6 +-
 3 files changed, 209 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index c6ab2e7ca3a6..65160a8a0ba3 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -54,6 +54,20 @@ static int map_pte_fn(pte_t *pte, struct page *pmd_page,
 	return 0;
 }
 
+/*
+ * This function is used to map shared frames to store grant status. It is
+ * different from map_pte_fn above, the frames type here is uint64_t.
+ */
+static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
+			     unsigned long addr, void *data)
+{
+	uint64_t **frames = (uint64_t **)data;
+
+	set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
+	(*frames)++;
+	return 0;
+}
+
 static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
 			unsigned long addr, void *data)
 {
@@ -83,7 +97,30 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 	return rc;
 }
 
-void arch_gnttab_unmap_shared(void *shared, unsigned long nr_gframes)
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+			   unsigned long max_nr_gframes,
+			   grant_status_t **__shared)
+{
+	int rc;
+	grant_status_t *shared = *__shared;
+
+	if (shared == NULL) {
+		/* No need to pass in PTE as we are going to do it
+		 * in apply_to_page_range anyhow. */
+		struct vm_struct *area =
+			alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
+		BUG_ON(area == NULL);
+		shared = area->addr;
+		*__shared = shared;
+	}
+
+	rc = apply_to_page_range(&init_mm, (unsigned long)shared,
+				 PAGE_SIZE * nr_gframes,
+				 map_pte_fn_status, &frames);
+	return rc;
+}
+
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
 {
 	apply_to_page_range(&init_mm, (unsigned long)shared,
 			    PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 0518d0404942..301869f60dc7 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -44,6 +44,7 @@
 #include <xen/page.h>
 #include <xen/grant_table.h>
 #include <xen/interface/memory.h>
+#include <xen/hvc-console.h>
 #include <asm/xen/hypercall.h>
 
 #include <asm/pgtable.h>
@@ -53,7 +54,10 @@
 /* External tools reserve first few grant table entries. */
 #define NR_RESERVED_ENTRIES 8
 #define GNTTAB_LIST_END 0xffffffff
-#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(struct grant_entry_v1))
+#define GREFS_PER_GRANT_FRAME \
+(grant_table_version == 1 ?                      \
+(PAGE_SIZE / sizeof(struct grant_entry_v1)) :   \
+(PAGE_SIZE / sizeof(union grant_entry_v2)))
 
 static grant_ref_t **gnttab_list;
 static unsigned int nr_grant_frames;
@@ -66,6 +70,7 @@ EXPORT_SYMBOL_GPL(xen_hvm_resume_frames);
 
 static union {
 	struct grant_entry_v1 *v1;
+	union grant_entry_v2 *v2;
 	void *addr;
 } gnttab_shared;
 
@@ -120,6 +125,9 @@ struct gnttab_ops {
 
 static struct gnttab_ops *gnttab_interface;
 
+/*This reflects status of grant entries, so act as a global value*/
+static grant_status_t *grstatus;
+
 static int grant_table_version;
 
 static struct gnttab_free_callback *gnttab_free_callback_list;
@@ -127,6 +135,7 @@ static struct gnttab_free_callback *gnttab_free_callback_list;
 static int gnttab_expand(unsigned int req_entries);
 
 #define RPP (PAGE_SIZE / sizeof(grant_ref_t))
+#define SPP (PAGE_SIZE / sizeof(grant_status_t))
 
 static inline grant_ref_t *__gnttab_entry(grant_ref_t entry)
 {
@@ -199,6 +208,7 @@ static void put_free_entry(grant_ref_t ref)
 }
 
 /*
+ * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2.
  * Introducing a valid entry into the grant table:
  *  1. Write ent->domid.
  *  2. Write ent->frame:
@@ -217,6 +227,15 @@ static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid,
 	gnttab_shared.v1[ref].flags = flags;
 }
 
+static void gnttab_update_entry_v2(grant_ref_t ref, domid_t domid,
+				   unsigned long frame, unsigned flags)
+{
+	gnttab_shared.v2[ref].hdr.domid = domid;
+	gnttab_shared.v2[ref].full_page.frame = frame;
+	wmb();
+	gnttab_shared.v2[ref].hdr.flags = GTF_permit_access | flags;
+}
+
 /*
  * Public grant-issuing interface functions
  */
@@ -248,6 +267,11 @@ static int gnttab_query_foreign_access_v1(grant_ref_t ref)
 	return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing);
 }
 
+static int gnttab_query_foreign_access_v2(grant_ref_t ref)
+{
+	return grstatus[ref] & (GTF_reading|GTF_writing);
+}
+
 int gnttab_query_foreign_access(grant_ref_t ref)
 {
 	return gnttab_interface->query_foreign_access(ref);
@@ -272,6 +296,29 @@ static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly)
 	return 1;
 }
 
+static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly)
+{
+	gnttab_shared.v2[ref].hdr.flags = 0;
+	mb();
+	if (grstatus[ref] & (GTF_reading|GTF_writing)) {
+		return 0;
+	} else {
+		/* The read of grstatus needs to have acquire
+		semantics.  On x86, reads already have
+		that, and we just need to protect against
+		compiler reorderings.  On other
+		architectures we may need a full
+		barrier. */
+#ifdef CONFIG_X86
+		barrier();
+#else
+		mb();
+#endif
+	}
+
+	return 1;
+}
+
 int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
 {
 	return gnttab_interface->end_foreign_access_ref(ref, readonly);
@@ -345,6 +392,37 @@ static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref)
 	return frame;
 }
 
+static unsigned long gnttab_end_foreign_transfer_ref_v2(grant_ref_t ref)
+{
+	unsigned long frame;
+	u16           flags;
+	u16          *pflags;
+
+	pflags = &gnttab_shared.v2[ref].hdr.flags;
+
+	/*
+	 * If a transfer is not even yet started, try to reclaim the grant
+	 * reference and return failure (== 0).
+	 */
+	while (!((flags = *pflags) & GTF_transfer_committed)) {
+		if (sync_cmpxchg(pflags, flags, 0) == flags)
+			return 0;
+		cpu_relax();
+	}
+
+	/* If a transfer is in progress then wait until it is completed. */
+	while (!(flags & GTF_transfer_completed)) {
+		flags = *pflags;
+		cpu_relax();
+	}
+
+	rmb();  /* Read the frame number /after/ reading completion status. */
+	frame = gnttab_shared.v2[ref].full_page.frame;
+	BUG_ON(frame == 0);
+
+	return frame;
+}
+
 unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
 {
 	return gnttab_interface->end_foreign_transfer_ref(ref);
@@ -592,6 +670,11 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
 }
 EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
 
+static unsigned nr_status_frames(unsigned nr_grant_frames)
+{
+	return (nr_grant_frames * GREFS_PER_GRANT_FRAME + SPP - 1) / SPP;
+}
+
 static int gnttab_map_frames_v1(unsigned long *frames, unsigned int nr_gframes)
 {
 	int rc;
@@ -606,7 +689,56 @@ static int gnttab_map_frames_v1(unsigned long *frames, unsigned int nr_gframes)
 
 static void gnttab_unmap_frames_v1(void)
 {
-	arch_gnttab_unmap_shared(gnttab_shared.addr, nr_grant_frames);
+	arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames);
+}
+
+static int gnttab_map_frames_v2(unsigned long *frames, unsigned int nr_gframes)
+{
+	uint64_t *sframes;
+	unsigned int nr_sframes;
+	struct gnttab_get_status_frames getframes;
+	int rc;
+
+	nr_sframes = nr_status_frames(nr_gframes);
+
+	/* No need for kzalloc as it is initialized in following hypercall
+	 * GNTTABOP_get_status_frames.
+	 */
+	sframes = kmalloc(nr_sframes  * sizeof(uint64_t), GFP_ATOMIC);
+	if (!sframes)
+		return -ENOMEM;
+
+	getframes.dom        = DOMID_SELF;
+	getframes.nr_frames  = nr_sframes;
+	set_xen_guest_handle(getframes.frame_list, sframes);
+
+	rc = HYPERVISOR_grant_table_op(GNTTABOP_get_status_frames,
+				       &getframes, 1);
+	if (rc == -ENOSYS) {
+		kfree(sframes);
+		return -ENOSYS;
+	}
+
+	BUG_ON(rc || getframes.status);
+
+	rc = arch_gnttab_map_status(sframes, nr_sframes,
+				    nr_status_frames(gnttab_max_grant_frames()),
+				    &grstatus);
+	BUG_ON(rc);
+	kfree(sframes);
+
+	rc = arch_gnttab_map_shared(frames, nr_gframes,
+				    gnttab_max_grant_frames(),
+				    &gnttab_shared.addr);
+	BUG_ON(rc);
+
+	return 0;
+}
+
+static void gnttab_unmap_frames_v2(void)
+{
+	arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames);
+	arch_gnttab_unmap(grstatus, nr_status_frames(nr_grant_frames));
 }
 
 static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
@@ -640,6 +772,9 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 		return rc;
 	}
 
+	/* No need for kzalloc as it is initialized in following hypercall
+	 * GNTTABOP_setup_table.
+	 */
 	frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
 	if (!frames)
 		return -ENOMEM;
@@ -672,10 +807,38 @@ static struct gnttab_ops gnttab_v1_ops = {
 	.query_foreign_access		= gnttab_query_foreign_access_v1,
 };
 
+static struct gnttab_ops gnttab_v2_ops = {
+	.map_frames			= gnttab_map_frames_v2,
+	.unmap_frames			= gnttab_unmap_frames_v2,
+	.update_entry			= gnttab_update_entry_v2,
+	.end_foreign_access_ref		= gnttab_end_foreign_access_ref_v2,
+	.end_foreign_transfer_ref	= gnttab_end_foreign_transfer_ref_v2,
+	.query_foreign_access		= gnttab_query_foreign_access_v2,
+};
+
 static void gnttab_request_version(void)
 {
-	grant_table_version = 1;
-	gnttab_interface = &gnttab_v1_ops;
+	int rc;
+	struct gnttab_set_version gsv;
+
+	gsv.version = 2;
+	rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &gsv, 1);
+	if (rc == 0) {
+		grant_table_version = 2;
+		gnttab_interface = &gnttab_v2_ops;
+	} else if (grant_table_version == 2) {
+		/*
+		 * If we've already used version 2 features,
+		 * but then suddenly discover that they're not
+		 * available (e.g. migrating to an older
+		 * version of Xen), almost unbounded badness
+		 * can happen.
+		 */
+		panic("we need grant tables version 2, but only version 1 is available");
+	} else {
+		grant_table_version = 1;
+		gnttab_interface = &gnttab_v1_ops;
+	}
 	printk(KERN_INFO "Grant tables using version %d layout.\n",
 		grant_table_version);
 }
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index c7a40f8d455a..5494c402c83a 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -146,8 +146,10 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
 int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 			   unsigned long max_nr_gframes,
 			   void **__shared);
-void arch_gnttab_unmap_shared(void *shared,
-			      unsigned long nr_gframes);
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+			   unsigned long max_nr_gframes,
+			   grant_status_t **__shared);
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes);
 
 extern unsigned long xen_hvm_resume_frames;
 unsigned int gnttab_max_grant_frames(void);
-- 
cgit v1.2.3-55-g7522


From 282e5aaba2a0cdfde4d2c2e34bc7438cd6f7a00f Mon Sep 17 00:00:00 2001
From: Paul Bolle
Date: Thu, 17 Nov 2011 11:41:31 +0100
Subject: x86: Kconfig: drop unknown symbol 'APM_MODULE'

There's no Kconfig symbol APM_MODULE, so the check for it will always
fail. There's no need to append _MODULE to tristate symbols anyhow,
because the config tools will do the right thing automagically.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6a47bb22657f..771a1852a07d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1718,7 +1718,7 @@ source "drivers/sfi/Kconfig"
 
 config X86_APM_BOOT
 	def_bool y
-	depends on APM || APM_MODULE
+	depends on APM
 
 menuconfig APM
 	tristate "APM (Advanced Power Management) BIOS support"
-- 
cgit v1.2.3-55-g7522


From 4673ca8eb3690832e76371371955a8b02e1f59d4 Mon Sep 17 00:00:00 2001
From: Michael S. Tsirkin
Date: Thu, 24 Nov 2011 14:54:28 +0200
Subject: lib: move GENERIC_IOMAP to lib/Kconfig

define GENERIC_IOMAP in a central location
instead of all architectures. This will be helpful
for the follow-up patch which makes it select
other configs. Code is also a bit shorter this way.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 arch/alpha/Kconfig             | 4 ----
 arch/cris/Kconfig              | 5 +----
 arch/hexagon/Kconfig           | 4 +---
 arch/ia64/Kconfig              | 5 +----
 arch/m68k/Kconfig              | 4 +---
 arch/openrisc/Kconfig          | 3 ---
 arch/powerpc/platforms/Kconfig | 3 ---
 arch/score/Kconfig             | 4 +---
 arch/sh/Kconfig                | 3 ---
 arch/unicore32/Kconfig         | 4 +---
 arch/x86/Kconfig               | 4 +---
 lib/Kconfig                    | 3 +++
 12 files changed, 10 insertions(+), 36 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 3d74801a4015..3636b11ddff0 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -70,10 +70,6 @@ config GENERIC_ISA_DMA
 	bool
 	default y
 
-config GENERIC_IOMAP
-	bool
-	default n
-
 source "init/Kconfig"
 source "kernel/Kconfig.freezer"
 
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 408b055c585f..b3abfb08aa5c 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -19,10 +19,6 @@ config GENERIC_CMOS_UPDATE
 config ARCH_USES_GETTIMEOFFSET
 	def_bool n
 
-config GENERIC_IOMAP
-       bool
-       default y
-
 config ARCH_HAS_ILOG2_U32
 	bool
 	default n
@@ -52,6 +48,7 @@ config CRIS
 	select HAVE_IDE
 	select HAVE_GENERIC_HARDIRQS
 	select GENERIC_IRQ_SHOW
+	select GENERIC_IOMAP
 
 config HZ
 	int
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 02513c2dd5ec..9059e3905887 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -26,6 +26,7 @@ config HEXAGON
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_TRACEHOOK
 	select NO_IOPORT
+	select GENERIC_IOMAP
 	# mostly generic routines, with some accelerated ones
 	---help---
 	  Qualcomm Hexagon is a processor architecture designed for high
@@ -73,9 +74,6 @@ config GENERIC_CSUM
 config GENERIC_IRQ_PROBE
 	def_bool y
 
-config GENERIC_IOMAP
-	def_bool y
-
 #config ZONE_DMA
 #	bool
 #	default y
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 27489b6dd533..2732e1b0aa3d 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -29,6 +29,7 @@ config IA64
 	select GENERIC_IRQ_SHOW
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
+	select GENERIC_IOMAP
 	default y
 	help
 	  The Itanium Processor Family is Intel's 64-bit successor to
@@ -102,10 +103,6 @@ config EFI
 	bool
 	default y
 
-config GENERIC_IOMAP
-	bool
-	default y
-
 config ARCH_CLOCKSOURCE_DATA
 	def_bool y
 
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 361d54019bb0..973e68614f28 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -38,9 +38,6 @@ config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
 
-config GENERIC_IOMAP
-	def_bool MMU
-
 config TIME_LOW_RES
 	bool
 	default y
@@ -73,6 +70,7 @@ source "kernel/Kconfig.freezer"
 config MMU
 	bool "MMU-based Paged Memory Management Support"
 	default y
+	select GENERIC_IOMAP
 	help
 	  Select if you want MMU-based virtualised addressing space
 	  support by paged memory management. If unsure, say 'Y'.
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index e518a5a4cf4c..081a54f1a93d 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -38,9 +38,6 @@ config RWSEM_XCHGADD_ALGORITHM
 config GENERIC_HWEIGHT
 	def_bool y
 
-config GENERIC_IOMAP
-	def_bool y
-
 config NO_IOPORT
 	def_bool y
 
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 3fe6d927ad70..100feed97940 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -175,9 +175,6 @@ config PPC_INDIRECT_MMIO
 config PPC_IO_WORKAROUNDS
 	bool
 
-config GENERIC_IOMAP
-	bool
-
 source "drivers/cpufreq/Kconfig"
 
 menu "CPU Frequency drivers"
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index df169e84db4e..455ce2d76823 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -4,6 +4,7 @@ config SCORE
        def_bool y
        select HAVE_GENERIC_HARDIRQS
        select GENERIC_IRQ_SHOW
+       select GENERIC_IOMAP
 
 choice
 	prompt "System type"
@@ -33,9 +34,6 @@ endmenu
 config CPU_SCORE7
 	bool
 
-config GENERIC_IOMAP
-	def_bool y
-
 config NO_DMA
 	bool
 	default y
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 5629e2099130..5aeab58ac8be 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -84,9 +84,6 @@ config GENERIC_GPIO
 config GENERIC_CALIBRATE_DELAY
 	bool
 
-config GENERIC_IOMAP
-	bool
-
 config GENERIC_CLOCKEVENTS
 	def_bool y
 
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index 942ed6174f1d..eeb8054c7cd8 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -12,6 +12,7 @@ config UNICORE32
 	select GENERIC_IRQ_PROBE
 	select GENERIC_IRQ_SHOW
 	select ARCH_WANT_FRAME_POINTERS
+	select GENERIC_IOMAP
 	help
 	  UniCore-32 is 32-bit Instruction Set Architecture,
 	  including a series of low-power-consumption RISC chip
@@ -30,9 +31,6 @@ config GENERIC_CLOCKEVENTS
 config GENERIC_CSUM
 	def_bool y
 
-config GENERIC_IOMAP
-	def_bool y
-
 config NO_IOPORT
 	bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cb9a1044a771..08af6457de72 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -75,6 +75,7 @@ config X86
 	select HAVE_BPF_JIT if (X86_64 && NET)
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
+	select GENERIC_IOMAP
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
@@ -140,9 +141,6 @@ config NEED_SG_DMA_LENGTH
 config GENERIC_ISA_DMA
 	def_bool ISA_DMA_API
 
-config GENERIC_IOMAP
-	def_bool y
-
 config GENERIC_BUG
 	def_bool y
 	depends on BUG
diff --git a/lib/Kconfig b/lib/Kconfig
index 32f3e5ae2be5..005892723a52 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -19,6 +19,9 @@ config RATIONAL
 config GENERIC_FIND_FIRST_BIT
 	bool
 
+config GENERIC_IOMAP
+	bool
+
 config CRC_CCITT
 	tristate "CRC-CCITT functions"
 	help
-- 
cgit v1.2.3-55-g7522


From 37fe6a42b3433b79a159ceb06a94cd1ef00e279d Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka
Date: Tue, 29 Nov 2011 15:08:29 +0900
Subject: x86: Check stack overflow in detail

Currently, only kernel stack is checked for the overflow, which
is not sufficient for systems that need a high reliability. To
enhance it, it is required to check the IRQ and exception
stacks, as well.

This patch checks all the stack types and will cause messages of
stacks in detail when free stack space drops below a certain
limit except user stack.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Randy Dunlap <rdunlap@xenotime.net>
Link: http://lkml.kernel.org/r/20111129060829.11076.51733.stgit@ltc219.sdl.hitachi.co.jp
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
 arch/x86/Kconfig.debug   |  7 +++++--
 arch/x86/kernel/irq_64.c | 29 +++++++++++++++++++++++------
 2 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bf56e1793272..4caec1261f12 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -63,8 +63,11 @@ config DEBUG_STACKOVERFLOW
 	bool "Check for stack overflows"
 	depends on DEBUG_KERNEL
 	---help---
-	  This option will cause messages to be printed if free stack space
-	  drops below a certain limit.
+	  Say Y here if you want to check the overflows of kernel, IRQ
+	  and exception stacks. This option will cause messages of the
+	  stacks in detail when free stack space drops below a certain
+	  limit.
+	  If in doubt, say "N".
 
 config X86_PTDUMP
 	bool "Export kernel pagetable layout to userspace via debugfs"
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 69bca468c47a..928a7e909619 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -36,18 +36,35 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);
 static inline void stack_overflow_check(struct pt_regs *regs)
 {
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
+	struct orig_ist *oist;
+	u64 irq_stack_top, irq_stack_bottom;
+	u64 estack_top, estack_bottom;
 	u64 curbase = (u64)task_stack_page(current);
 
 	if (user_mode_vm(regs))
 		return;
 
-	WARN_ONCE(regs->sp >= curbase &&
-		  regs->sp <= curbase + THREAD_SIZE &&
-		  regs->sp <  curbase + sizeof(struct thread_info) +
-					sizeof(struct pt_regs) + 128,
+	if (regs->sp >= curbase &&
+	    regs->sp <= curbase + THREAD_SIZE &&
+	    regs->sp >= curbase + sizeof(struct thread_info) +
+				  sizeof(struct pt_regs) + 128)
+		return;
+
+	irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack);
+	irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
+	if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
+		return;
+
+	oist = &__get_cpu_var(orig_ist);
+	estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ;
+	estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
+	if (regs->sp >= estack_top && regs->sp <= estack_bottom)
+		return;
 
-		  "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
-			current->comm, curbase, regs->sp);
+	WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
+		current->comm, curbase, regs->sp,
+		irq_stack_top, irq_stack_bottom,
+		estack_top, estack_bottom);
 #endif
 }
 
-- 
cgit v1.2.3-55-g7522


From 55af77969fbd7a841838220ea2287432e0da8ae5 Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka
Date: Tue, 29 Nov 2011 15:08:36 +0900
Subject: x86: Panic on detection of stack overflow

Currently, messages are just output on the detection of stack
overflow, which is not sufficient for systems that need a
high reliability. This is because in general the overflow may
corrupt data, and the additional corruption may occur due to
reading them unless systems stop.

This patch adds the sysctl parameter
kernel.panic_on_stackoverflow and causes a panic when detecting
the overflows of kernel, IRQ and exception stacks except user
stack according to the parameter. It is disabled by default.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/sysctl/kernel.txt | 14 ++++++++++++++
 arch/x86/kernel/irq_32.c        |  2 ++
 arch/x86/kernel/irq_64.c        |  5 +++++
 include/linux/kernel.h          |  1 +
 kernel/sysctl.c                 |  9 +++++++++
 5 files changed, 31 insertions(+)

(limited to 'arch/x86')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 1f2463671a1a..6d8cd8b2c30d 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -49,6 +49,7 @@ show up in /proc/sys/kernel:
 - panic
 - panic_on_oops
 - panic_on_unrecovered_nmi
+- panic_on_stackoverflow
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -393,6 +394,19 @@ Controls the kernel's behaviour when an oops or BUG is encountered.
 
 ==============================================================
 
+panic_on_stackoverflow:
+
+Controls the kernel's behavior when detecting the overflows of
+kernel, IRQ and exception stacks except a user stack.
+This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled.
+
+0: try to continue operation.
+
+1: panic immediately.
+
+==============================================================
+
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 72090705a656..e16e99ebd7ad 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -43,6 +43,8 @@ static void print_stack_overflow(void)
 {
 	printk(KERN_WARNING "low stack detected by irq handler\n");
 	dump_stack();
+	if (sysctl_panic_on_stackoverflow)
+		panic("low stack detected by irq handler - check messages\n");
 }
 
 #else
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 928a7e909619..42552b0dce6a 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
 DEFINE_PER_CPU(struct pt_regs *, irq_regs);
 EXPORT_PER_CPU_SYMBOL(irq_regs);
 
+int sysctl_panic_on_stackoverflow;
+
 /*
  * Probabilistic stack overflow check:
  *
@@ -65,6 +67,9 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 		current->comm, curbase, regs->sp,
 		irq_stack_top, irq_stack_bottom,
 		estack_top, estack_bottom);
+
+	if (sysctl_panic_on_stackoverflow)
+		panic("low stack detected by irq handler - check messages\n");
 #endif
 }
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e8b1597b5cf2..ff83683c0b9d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -341,6 +341,7 @@ extern int panic_timeout;
 extern int panic_on_oops;
 extern int panic_on_unrecovered_nmi;
 extern int panic_on_io_nmi;
+extern int sysctl_panic_on_stackoverflow;
 extern const char *print_tainted(void);
 extern void add_taint(unsigned flag);
 extern int test_taint(unsigned flag);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae2719643854..f487f257e05e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+	{
+		.procname	= "panic_on_stackoverflow",
+		.data		= &sysctl_panic_on_stackoverflow,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 	{
 		.procname	= "bootloader_type",
 		.data		= &bootloader_type,
-- 
cgit v1.2.3-55-g7522


From 467e6b7a7c0eb792ebaf322ddb7363742b4ead40 Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka
Date: Tue, 29 Nov 2011 15:08:45 +0900
Subject: x86: Clean up the range of stack overflow checking

The overflow checking of kernel stack checks if the stack
pointer points to the available kernel stack range, which is
derived from the original overflow checking.

It is clear that curbase address is always less than low
boundary of available kernel stack. So, this patch removes the
first condition that checks if the pointer is higher than
curbase.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Randy Dunlap <rdunlap@xenotime.net>
Link: http://lkml.kernel.org/r/20111129060845.11076.40916.stgit@ltc219.sdl.hitachi.co.jp
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
 arch/x86/kernel/irq_64.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 42552b0dce6a..54e2b2b2e250 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -46,10 +46,9 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 	if (user_mode_vm(regs))
 		return;
 
-	if (regs->sp >= curbase &&
-	    regs->sp <= curbase + THREAD_SIZE &&
-	    regs->sp >= curbase + sizeof(struct thread_info) +
-				  sizeof(struct pt_regs) + 128)
+	if (regs->sp >= curbase + sizeof(struct thread_info) +
+				  sizeof(struct pt_regs) + 128 &&
+	    regs->sp <= curbase + THREAD_SIZE)
 		return;
 
 	irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack);
-- 
cgit v1.2.3-55-g7522


From 3603a2512f9e69dc87914ba922eb4a0812b21cd6 Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Thu, 13 Oct 2011 15:14:25 -0400
Subject: x86, reboot: Use NMI instead of REBOOT_VECTOR to stop cpus

A recent discussion started talking about the locking on the
pstore fs and how it relates to the kmsg infrastructure.  We
noticed it was possible for userspace to r/w to the pstore fs
(grabbing the locks in the process) and block the panic path
from r/w to the same fs.

The reason was the cpu with the lock could be doing work while
the crashing cpu is panic'ing.  Busting those spinlocks might
cause those cpus to step on each other's data.  Fine, fair
enough.

It was suggested it would be nice to serialize the panic path
(ie stop the other cpus) and have only one cpu running.  This
would allow us to bust the spinlocks and not worry about another
cpu stepping on the data.

Of course, smp_send_stop() does this in the panic case.
kmsg_dump() would have to be moved to be called after it.  Easy
enough.

The only problem is on x86 the smp_send_stop() function calls
the REBOOT_VECTOR.  Any cpu with irqs disabled (which pstore and
its backend ERST would do), block this IPI and thus do not stop.
 This makes it difficult to reliably log data to the pstore fs.

The patch below switches from the REBOOT_VECTOR to NMI (and
mimics what kdump does).  Switching to NMI allows us to deliver
the IPI when irqs are disabled, increasing the reliability of
this function.

However, Andi carefully noted that on some machines this
approach does not work because of broken BIOSes or whatever.

To help accomodate this, the next couple of patches will run a
selftest and provide a knob to disable.

V2:
  uses atomic ops to serialize the cpu that shuts everyone down
V3:
  comment cleanup

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robert Richter <robert.richter@amd.com>
Cc: seiji.aguchi@hds.com
Cc: vgoyal@redhat.com
Cc: mjg@redhat.com
Cc: tony.luck@intel.com
Cc: gong.chen@intel.com
Cc: satoru.moriya@hds.com
Cc: avi@redhat.com
Cc: Andi Kleen <andi@firstfloor.org>
Link: http://lkml.kernel.org/r/1318533267-18880-2-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 16204dc15484..e72b1754a2d7 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -29,6 +29,7 @@
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 #include <asm/apic.h>
+#include <asm/nmi.h>
 /*
  *	Some notes on x86 processor bugs affecting SMP operation:
  *
@@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask)
 	free_cpumask_var(allbutself);
 }
 
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+
+static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
+{
+	/* We are registered on stopping cpu too, avoid spurious NMI */
+	if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
+		return NMI_HANDLED;
+
+	stop_this_cpu(NULL);
+
+	return NMI_HANDLED;
+}
+
+static void native_nmi_stop_other_cpus(int wait)
+{
+	unsigned long flags;
+	unsigned long timeout;
+
+	if (reboot_force)
+		return;
+
+	/*
+	 * Use an own vector here because smp_call_function
+	 * does lots of things not suitable in a panic situation.
+	 */
+	if (num_online_cpus() > 1) {
+		/* did someone beat us here? */
+		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id() != -1))
+			return;
+
+		if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+					 NMI_FLAG_FIRST, "smp_stop"))
+			/* Note: we ignore failures here */
+			return;
+
+		/* sync above data before sending NMI */
+		wmb();
+
+		apic->send_IPI_allbutself(NMI_VECTOR);
+
+		/*
+		 * Don't wait longer than a second if the caller
+		 * didn't ask us to wait.
+		 */
+		timeout = USEC_PER_SEC;
+		while (num_online_cpus() > 1 && (wait || timeout--))
+			udelay(1);
+	}
+
+	local_irq_save(flags);
+	disable_local_APIC();
+	local_irq_restore(flags);
+}
+
 /*
  * this function calls the 'stop' function on all other CPUs in the system.
  */
@@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void)
 	irq_exit();
 }
 
-static void native_stop_other_cpus(int wait)
+static void native_irq_stop_other_cpus(int wait)
 {
 	unsigned long flags;
 	unsigned long timeout;
@@ -230,7 +285,7 @@ struct smp_ops smp_ops = {
 	.smp_prepare_cpus	= native_smp_prepare_cpus,
 	.smp_cpus_done		= native_smp_cpus_done,
 
-	.stop_other_cpus	= native_stop_other_cpus,
+	.stop_other_cpus	= native_nmi_stop_other_cpus,
 	.smp_send_reschedule	= native_smp_send_reschedule,
 
 	.cpu_up			= native_cpu_up,
-- 
cgit v1.2.3-55-g7522


From 99e8b9ca90d688c3ac7d3a141b701c9694a93925 Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Thu, 13 Oct 2011 15:14:26 -0400
Subject: x86, NMI: Add NMI IPI selftest

The previous patch modified the stop cpus path to use NMI
instead of IRQ as the way to communicate to the other cpus to
shutdown.  There were some concerns that various machines may
have problems with using an NMI IPI.

This patch creates a selftest to check if NMI is working at
boot. The idea is to help catch any issues before the machine
panics and we learn the hard way.

Loosely based on the locking-selftest.c file, this separate file
runs a couple of simple tests and reports the results.  The
output looks like:

...
Brought up 4 CPUs
----------------
| NMI testsuite:
--------------------
  remote IPI:  ok  |
   local IPI:  ok  |
--------------------
Good, all   2 testcases passed! |
---------------------------------
Total of 4 processors activated (21330.61 BogoMIPS).
...

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robert Richter <robert.richter@amd.com>
Cc: seiji.aguchi@hds.com
Cc: vgoyal@redhat.com
Cc: mjg@redhat.com
Cc: tony.luck@intel.com
Cc: gong.chen@intel.com
Cc: satoru.moriya@hds.com
Cc: avi@redhat.com
Cc: Andi Kleen <andi@firstfloor.org>
Link: http://lkml.kernel.org/r/1318533267-18880-3-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.debug         |  12 +++
 arch/x86/include/asm/smp.h     |   6 ++
 arch/x86/kernel/Makefile       |   1 +
 arch/x86/kernel/nmi_selftest.c | 179 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/smpboot.c      |   1 +
 5 files changed, 199 insertions(+)
 create mode 100644 arch/x86/kernel/nmi_selftest.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 4caec1261f12..97da3c17b424 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -287,4 +287,16 @@ config DEBUG_STRICT_USER_COPY_CHECKS
 
 	  If unsure, or if you run an older (pre 4.4) gcc, say N.
 
+config DEBUG_NMI_SELFTEST
+	bool "NMI Selftest"
+	depends on DEBUG_KERNEL
+	---help---
+	  Enabling this option turns on a quick NMI selftest to verify
+	  that the NMI behaves correctly.
+
+	  This might help diagnose strange hangs that rely on NMI to
+	  function properly.
+
+	  If unsure, say N.
+
 endmenu
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 73b11bc0ae6f..0434c400287c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -225,5 +225,11 @@ extern int hard_smp_processor_id(void);
 
 #endif /* CONFIG_X86_LOCAL_APIC */
 
+#ifdef CONFIG_DEBUG_NMI_SELFTEST
+extern void nmi_selftest(void);
+#else
+#define nmi_selftest() do { } while (0)
+#endif
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8baca3c4871c..02b2f05b371e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -80,6 +80,7 @@ obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
 obj-$(CONFIG_AMD_NB)		+= amd_nb.o
 obj-$(CONFIG_DEBUG_RODATA_TEST)	+= test_rodata.o
 obj-$(CONFIG_DEBUG_NX_TEST)	+= test_nx.o
+obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
 
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o
 obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
new file mode 100644
index 000000000000..572adb622251
--- /dev/null
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -0,0 +1,179 @@
+/*
+ * arch/x86/kernel/nmi-selftest.c
+ *
+ * Testsuite for NMI: IPIs
+ *
+ * Started by Don Zickus:
+ * (using lib/locking-selftest.c as a guide)
+ *
+ *   Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
+ */
+
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#define SUCCESS		0
+#define FAILURE		1
+#define TIMEOUT		2
+
+static int nmi_fail;
+
+/* check to see if NMI IPIs work on this machine */
+static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly;
+
+static int testcase_total;
+static int testcase_successes;
+static int expected_testcase_failures;
+static int unexpected_testcase_failures;
+static int unexpected_testcase_unknowns;
+
+static int nmi_unk_cb(unsigned int val, struct pt_regs *regs)
+{
+	unexpected_testcase_unknowns++;
+	return NMI_HANDLED;
+}
+
+static void init_nmi_testsuite(void)
+{
+	/* trap all the unknown NMIs we may generate */
+	register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
+}
+
+static void cleanup_nmi_testsuite(void)
+{
+	unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
+}
+
+static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
+{
+        int cpu = raw_smp_processor_id();
+
+        if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask)))
+                return NMI_HANDLED;
+
+        return NMI_DONE;
+}
+
+static void test_nmi_ipi(struct cpumask *mask)
+{
+	unsigned long timeout;
+
+	if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+				 NMI_FLAG_FIRST, "nmi_selftest")) {
+		nmi_fail = FAILURE;
+		return;
+	}
+
+	/* sync above data before sending NMI */
+	wmb();
+
+	apic->send_IPI_mask(mask, NMI_VECTOR);
+
+	/* Don't wait longer than a second */
+	timeout = USEC_PER_SEC;
+	while (!cpumask_empty(mask) && timeout--)
+	        udelay(1);
+
+	/* What happens if we timeout, do we still unregister?? */
+	unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
+
+	if (!timeout)
+		nmi_fail = TIMEOUT;
+	return;
+}
+
+static void remote_ipi(void)
+{
+	cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+	test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void local_ipi(void)
+{
+	cpumask_clear(to_cpumask(nmi_ipi_mask));
+	cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+	test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void reset_nmi(void)
+{
+	nmi_fail = 0;
+}
+
+static void dotest(void (*testcase_fn)(void), int expected)
+{
+	testcase_fn();
+	/*
+	 * Filter out expected failures:
+	 */
+	if (nmi_fail != expected) {
+		unexpected_testcase_failures++;
+
+		if (nmi_fail == FAILURE)
+			printk("FAILED |");
+		else if (nmi_fail == TIMEOUT)
+			printk("TIMEOUT|");
+		else
+			printk("ERROR  |");
+		dump_stack();
+	} else {
+		testcase_successes++;
+		printk("  ok  |");
+	}
+	testcase_total++;
+
+	reset_nmi();
+}
+
+static inline void print_testname(const char *testname)
+{
+	printk("%12s:", testname);
+}
+
+void nmi_selftest(void)
+{
+	init_nmi_testsuite();
+
+        /*
+	 * Run the testsuite:
+	 */
+	printk("----------------\n");
+	printk("| NMI testsuite:\n");
+	printk("--------------------\n");
+
+	print_testname("remote IPI");
+	dotest(remote_ipi, SUCCESS);
+	printk("\n");
+	print_testname("local IPI");
+	dotest(local_ipi, SUCCESS);
+	printk("\n");
+
+	cleanup_nmi_testsuite();
+
+	if (unexpected_testcase_failures) {
+		printk("--------------------\n");
+		printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
+			unexpected_testcase_failures, testcase_total);
+		printk("-----------------------------------------------------------------\n");
+	} else if (expected_testcase_failures && testcase_successes) {
+		printk("--------------------\n");
+		printk("%3d out of %3d testcases failed, as expected. |\n",
+			expected_testcase_failures, testcase_total);
+		printk("----------------------------------------------------\n");
+	} else if (expected_testcase_failures && !testcase_successes) {
+		printk("--------------------\n");
+		printk("All %3d testcases failed, as expected. |\n",
+			expected_testcase_failures);
+		printk("----------------------------------------\n");
+	} else {
+		printk("--------------------\n");
+		printk("Good, all %3d testcases passed! |\n",
+			testcase_successes);
+		printk("---------------------------------\n");
+	}
+}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9f548cb4a958..19277817effa 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1142,6 +1142,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 {
 	pr_debug("Boot done.\n");
 
+	nmi_selftest();
 	impress_friends();
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
-- 
cgit v1.2.3-55-g7522


From bda62633983f9db49ce0b1a9235b3709c1cda5f0 Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Thu, 13 Oct 2011 15:14:27 -0400
Subject: x86, NMI: Add knob to disable using NMI IPIs to stop cpus

Some machines may exhibit problems using the NMI to stop other
cpus. This knob just allows one to revert back to the original
behaviour to help diagnose the problem.

V2:
  make function static

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robert Richter <robert.richter@amd.com>
Cc: seiji.aguchi@hds.com
Cc: vgoyal@redhat.com
Cc: mjg@redhat.com
Cc: tony.luck@intel.com
Cc: gong.chen@intel.com
Cc: satoru.moriya@hds.com
Cc: avi@redhat.com
Cc: Andi Kleen <andi@firstfloor.org>
Link: http://lkml.kernel.org/r/1318533267-18880-4-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/kernel-parameters.txt |  4 ++++
 arch/x86/kernel/smp.c               | 13 +++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'arch/x86')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a0c5c5f4fce6..b4339e5a50da 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1796,6 +1796,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	nomfgpt		[X86-32] Disable Multi-Function General Purpose
 			Timer usage (for AMD Geode machines).
 
+	nonmi_ipi	[X86] Disable using NMI IPIs during panic/reboot to
+			shutdown the other cpus.  Instead use the REBOOT_VECTOR
+			irq.
+
 	nopat		[X86] Disable PAT (page attribute table extension of
 			pagetables) support.
 
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index e72b1754a2d7..113acda5879e 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -249,6 +249,11 @@ static void native_irq_stop_other_cpus(int wait)
 	local_irq_restore(flags);
 }
 
+static void native_smp_disable_nmi_ipi(void)
+{
+	smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
+}
+
 /*
  * Reschedule call back.
  */
@@ -280,6 +285,14 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
+static int __init nonmi_ipi_setup(char *str)
+{
+        native_smp_disable_nmi_ipi();
+        return 1;
+}
+
+__setup("nonmi_ipi", nonmi_ipi_setup);
+
 struct smp_ops smp_ops = {
 	.smp_prepare_boot_cpu	= native_smp_prepare_boot_cpu,
 	.smp_prepare_cpus	= native_smp_prepare_cpus,
-- 
cgit v1.2.3-55-g7522


From 53b5650273fea486ac8ac6c1d1e9a6cd17aa31ca Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 5 Dec 2011 12:25:44 +0100
Subject: x86: Fix the 32-bit stackoverflow-debug build

The panic_on_stackoverflow variable needs to be avilable
on the 32-bit side as well ...

Cc: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq_32.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index e16e99ebd7ad..40fc86161d92 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs);
 EXPORT_PER_CPU_SYMBOL(irq_regs);
 
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
+
+int sysctl_panic_on_stackoverflow __read_mostly;
+
 /* Debugging check for stack overflow: is there less than 1KB free? */
 static int check_stack_overflow(void)
 {
-- 
cgit v1.2.3-55-g7522


From 54b0264ec8c6e90f0413ad30e2f91c65e7844613 Mon Sep 17 00:00:00 2001
From: Alan Cox
Date: Wed, 16 Nov 2011 18:17:40 +0000
Subject: x86/sfi: Kill the IRQ as id hack

Nothing should now need it so take it out

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/mrst/mrst.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index b1489a06a49d..6a21f603bd78 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -802,8 +802,7 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *entry)
 	if (mrst_has_msic())
 		return;
 
-	/* ID as IRQ is a hack that will go away */
-	pdev = platform_device_alloc(entry->name, entry->irq);
+	pdev = platform_device_alloc(entry->name, 0);
 	if (pdev == NULL) {
 		pr_err("out of memory for SFI platform device '%s'.\n",
 			entry->name);
-- 
cgit v1.2.3-55-g7522


From 1ea7c6737c8f68453f55c894b3d07d7f48fcbef8 Mon Sep 17 00:00:00 2001
From: Alan Cox
Date: Thu, 10 Nov 2011 13:29:14 +0000
Subject: x86/config: Revamp configuration for MID devices

This follows on from the patch applied in 3.2rc1 which creates
an INTEL_MID configuration. We can now add the entry for
Medfield specific code. After this is merged the final patch
will be submitted which moves the rest of the device Kconfig
dependancies to MRST/MEDFIELD/INTEL_MID as appropriate.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                | 17 +++++++++++++++++
 arch/x86/Kconfig.debug          |  6 +++---
 arch/x86/kernel/early_printk.c  |  2 +-
 arch/x86/platform/mrst/Makefile |  2 +-
 4 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cb9a1044a771..9e7a361423d6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -419,6 +419,23 @@ config X86_MRST
 	  nor standard legacy replacement devices/features. e.g. Moorestown does
 	  not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
 
+config X86_MDFLD
+       bool "Medfield MID platform"
+	depends on PCI
+	depends on PCI_GOANY
+	depends on X86_IO_APIC
+	select APB_TIMER
+	select I2C
+	select SPI
+	select INTEL_SCU_IPC
+	select X86_PLATFORM_DEVICES
+	---help---
+	  Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
+	  Internet Device(MID) platform. 
+	  Unlike standard x86 PCs, Medfield does not have many legacy devices
+	  nor standard legacy replacement devices/features. e.g. Medfield does
+	  not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
+
 endif
 
 config X86_RDC321X
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bf56e1793272..28c3c73ab208 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,9 +43,9 @@ config EARLY_PRINTK
 	  with klogd/syslogd or the X server. You should normally N here,
 	  unless you want to debug such a crash.
 
-config EARLY_PRINTK_MRST
-	bool "Early printk for MRST platform support"
-	depends on EARLY_PRINTK && X86_MRST
+config EARLY_PRINTK_INTEL_MID
+	bool "Early printk for Intel MID platform support"
+	depends on EARLY_PRINTK && X86_INTEL_MID
 
 config EARLY_PRINTK_DBGP
 	bool "Early printk via EHCI debug port"
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cd28a350f7f9..7a53da03086f 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf)
 		if (!strncmp(buf, "xen", 3))
 			early_console_register(&xenboot_console, keep);
 #endif
-#ifdef CONFIG_EARLY_PRINTK_MRST
+#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
 		if (!strncmp(buf, "mrst", 4)) {
 			mrst_early_console_init();
 			early_console_register(&early_mrst_console, keep);
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index 1ea38775a6d3..ddeec7300464 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_X86_MRST)		+= mrst.o
 obj-$(CONFIG_X86_MRST)		+= vrtc.o
-obj-$(CONFIG_EARLY_PRINTK_MRST)	+= early_printk_mrst.o
+obj-$(CONFIG_EARLY_PRINTK_INTEL_MID)	+= early_printk_mrst.o
 obj-$(CONFIG_X86_MRST)		+= pmu.o
-- 
cgit v1.2.3-55-g7522


From 9af0c7a6fa860698d080481f24a342ba74b68982 Mon Sep 17 00:00:00 2001
From: Ludwig Nussel
Date: Tue, 15 Nov 2011 14:46:46 -0800
Subject: x86: Fix mmap random address range

On x86_32 casting the unsigned int result of get_random_int() to
long may result in a negative value.  On x86_32 the range of
mmap_rnd() therefore was -255 to 255.  The 32bit mode on x86_64
used 0 to 255 as intended.

The bug was introduced by 675a081 ("x86: unify mmap_{32|64}.c")
in January 2008.

Signed-off-by: Ludwig Nussel <ludwig.nussel@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: harvey.harrison@gmail.com
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/201111152246.pAFMklOB028527@wpaz5.hot.corp.google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/mmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 4b5ba85eb5c9..845df6835f9f 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -75,9 +75,9 @@ static unsigned long mmap_rnd(void)
 	*/
 	if (current->flags & PF_RANDOMIZE) {
 		if (mmap_is_ia32())
-			rnd = (long)get_random_int() % (1<<8);
+			rnd = get_random_int() % (1<<8);
 		else
-			rnd = (long)(get_random_int() % (1<<28));
+			rnd = get_random_int() % (1<<28);
 	}
 	return rnd << PAGE_SHIFT;
 }
-- 
cgit v1.2.3-55-g7522


From d1bbdd669298b7ca08284ddb29153dfc039dd89d Mon Sep 17 00:00:00 2001
From: Mike Ditto
Date: Tue, 15 Nov 2011 14:46:50 -0800
Subject: arch/x86/kernel/e820.c: Eliminate bubble sort from
 sanitize_e820_map()

Replace the bubble sort in sanitize_e820_map() with a call to
the generic kernel sort function to avoid pathological
performance with large maps.

On large (thousands of entries) E820 maps, the previous code
took minutes to run; with this change it's now milliseconds.

Signed-off-by: Mike Ditto <mditto@google.com>
Cc: sassmann@kpanic.de
Cc: yuenn@google.com
Cc: Stefan Assmann <sassmann@kpanic.de>
Cc: Nancy Yuen <yuenn@google.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 59 ++++++++++++++++++++------------------------------
 1 file changed, 24 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 303a0e48f076..f655f802260d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -19,6 +19,7 @@
 #include <linux/acpi.h>
 #include <linux/firmware-map.h>
 #include <linux/memblock.h>
+#include <linux/sort.h>
 
 #include <asm/e820.h>
 #include <asm/proto.h>
@@ -227,22 +228,38 @@ void __init e820_print_map(char *who)
  *	   ____________________33__
  *	   ______________________4_
  */
+struct change_member {
+	struct e820entry *pbios; /* pointer to original bios entry */
+	unsigned long long addr; /* address for this change point */
+};
+
+static int __init cpcompare(const void *a, const void *b)
+{
+	struct change_member * const *app = a, * const *bpp = b;
+	const struct change_member *ap = *app, *bp = *bpp;
+
+	/*
+	 * Inputs are pointers to two elements of change_point[].  If their
+	 * addresses are unequal, their difference dominates.  If the addresses
+	 * are equal, then consider one that represents the end of its region
+	 * to be greater than one that does not.
+	 */
+	if (ap->addr != bp->addr)
+		return ap->addr > bp->addr ? 1 : -1;
+
+	return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
+}
 
 int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 			     u32 *pnr_map)
 {
-	struct change_member {
-		struct e820entry *pbios; /* pointer to original bios entry */
-		unsigned long long addr; /* address for this change point */
-	};
 	static struct change_member change_point_list[2*E820_X_MAX] __initdata;
 	static struct change_member *change_point[2*E820_X_MAX] __initdata;
 	static struct e820entry *overlap_list[E820_X_MAX] __initdata;
 	static struct e820entry new_bios[E820_X_MAX] __initdata;
-	struct change_member *change_tmp;
 	unsigned long current_type, last_type;
 	unsigned long long last_addr;
-	int chgidx, still_changing;
+	int chgidx;
 	int overlap_entries;
 	int new_bios_entry;
 	int old_nr, new_nr, chg_nr;
@@ -279,35 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 	chg_nr = chgidx;
 
 	/* sort change-point list by memory addresses (low -> high) */
-	still_changing = 1;
-	while (still_changing)	{
-		still_changing = 0;
-		for (i = 1; i < chg_nr; i++)  {
-			unsigned long long curaddr, lastaddr;
-			unsigned long long curpbaddr, lastpbaddr;
-
-			curaddr = change_point[i]->addr;
-			lastaddr = change_point[i - 1]->addr;
-			curpbaddr = change_point[i]->pbios->addr;
-			lastpbaddr = change_point[i - 1]->pbios->addr;
-
-			/*
-			 * swap entries, when:
-			 *
-			 * curaddr > lastaddr or
-			 * curaddr == lastaddr and curaddr == curpbaddr and
-			 * lastaddr != lastpbaddr
-			 */
-			if (curaddr < lastaddr ||
-			    (curaddr == lastaddr && curaddr == curpbaddr &&
-			     lastaddr != lastpbaddr)) {
-				change_tmp = change_point[i];
-				change_point[i] = change_point[i-1];
-				change_point[i-1] = change_tmp;
-				still_changing = 1;
-			}
-		}
-	}
+	sort(change_point, chg_nr, sizeof *change_point, cpcompare, 0);
 
 	/* create a new bios memory map, removing overlaps */
 	overlap_entries = 0;	 /* number of entries in the overlap table */
-- 
cgit v1.2.3-55-g7522


From 706d9a9c8b5758390036b9980a2b12d809599777 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten
Date: Tue, 15 Nov 2011 14:48:56 -0800
Subject: arch/x86/kernel/e820.c: quiet sparse noise about plain integer as
 NULL pointer

The last parameter to sort() is a pointer to the function used
to swap items.  This parameter should be NULL, not 0, when not
used.  This quiets the following sparse warning:

 warning: Using plain integer as NULL pointer

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: hartleys@visionengravers.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index f655f802260d..d6bd85352c81 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -296,7 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 	chg_nr = chgidx;
 
 	/* sort change-point list by memory addresses (low -> high) */
-	sort(change_point, chg_nr, sizeof *change_point, cpcompare, 0);
+	sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);
 
 	/* create a new bios memory map, removing overlaps */
 	overlap_entries = 0;	 /* number of entries in the overlap table */
-- 
cgit v1.2.3-55-g7522


From b565201cf75210614903ef2ae5917b4379681647 Mon Sep 17 00:00:00 2001
From: Jack Steiner
Date: Tue, 15 Nov 2011 15:33:56 -0800
Subject: x86: Reduce clock calibration time during slave cpu startup

Reduce the startup time for slave cpus.

Adds hooks for an arch-specific function for clock calibration.
These hooks are used on x86.  If a newly started cpu has the
same phys_proc_id as a core already active, uses the TSC for the
delay loop and has a CONSTANT_TSC, use the already-calculated
value of loops_per_jiffy.

This patch reduces the time required to start slave cpus on a
4096 cpu system from: 465 sec OLD 62 sec NEW

This reduces boot time on a 4096p system by almost 7 minutes.
Nice...

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: John Stultz <john.stultz@linaro.org>
[fix CONFIG_SMP=n build]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 16 +++++++++++-----
 arch/x86/kernel/tsc.c     | 20 ++++++++++++++++++++
 init/calibrate.c          | 15 +++++++++++++++
 3 files changed, 46 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9f548cb4a958..00eef55c8327 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -207,22 +207,28 @@ static void __cpuinit smp_callin(void)
 	 * Need to setup vector mappings before we enable interrupts.
 	 */
 	setup_vector_irq(smp_processor_id());
+
+	/*
+	 * Save our processor parameters. Note: this information
+	 * is needed for clock calibration.
+	 */
+	smp_store_cpu_info(cpuid);
+
 	/*
 	 * Get our bogomips.
+	 * Update loops_per_jiffy in cpu_data. Previous call to
+	 * smp_store_cpu_info() stored a value that is close but not as
+	 * accurate as the value just calculated.
 	 *
 	 * Need to enable IRQs because it can take longer and then
 	 * the NMI watchdog might kill us.
 	 */
 	local_irq_enable();
 	calibrate_delay();
+	cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
 	local_irq_disable();
 	pr_debug("Stack at about %p\n", &cpuid);
 
-	/*
-	 * Save our processor parameters
-	 */
-	smp_store_cpu_info(cpuid);
-
 	/*
 	 * This must be done before setting cpu_online_mask
 	 * or calling notify_cpu_starting.
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index db483369f10b..490fb330be87 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -995,3 +995,23 @@ void __init tsc_init(void)
 	check_system_tsc_reliable();
 }
 
+#ifdef CONFIG_SMP
+/*
+ * If we have a constant TSC and are using the TSC for the delay loop,
+ * we can skip clock calibration if another cpu in the same socket has already
+ * been calibrated. This assumes that CONSTANT_TSC applies to all
+ * cpus in the socket - this should be a safe assumption.
+ */
+unsigned long __cpuinit calibrate_delay_is_known(void)
+{
+	int i, cpu = smp_processor_id();
+
+	if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
+		return 0;
+
+	for_each_online_cpu(i)
+		if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id)
+			return cpu_data(i).loops_per_jiffy;
+	return 0;
+}
+#endif
diff --git a/init/calibrate.c b/init/calibrate.c
index 24df7976816c..5f117ca9e069 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -246,6 +246,19 @@ recalibrate:
 
 static DEFINE_PER_CPU(unsigned long, cpu_loops_per_jiffy) = { 0 };
 
+/*
+ * Check if cpu calibration delay is already known. For example,
+ * some processors with multi-core sockets may have all cores
+ * with the same calibration delay.
+ *
+ * Architectures should override this function if a faster calibration
+ * method is available.
+ */
+unsigned long __attribute__((weak)) __cpuinit calibrate_delay_is_known(void)
+{
+	return 0;
+}
+
 void __cpuinit calibrate_delay(void)
 {
 	unsigned long lpj;
@@ -265,6 +278,8 @@ void __cpuinit calibrate_delay(void)
 		lpj = lpj_fine;
 		pr_info("Calibrating delay loop (skipped), "
 			"value calculated using timer frequency.. ");
+	} else if ((lpj = calibrate_delay_is_known())) {
+		;
 	} else if ((lpj = calibrate_delay_direct()) != 0) {
 		if (!printed)
 			pr_info("Calibrating delay using timer "
-- 
cgit v1.2.3-55-g7522


From f9b15df466ba923a5832c9121ad8327ccf5483ef Mon Sep 17 00:00:00 2001
From: Alessandro Rubini
Date: Sat, 29 Oct 2011 00:48:42 +0200
Subject: x86/Kconfig: Cyclone-timer depends on x86-summit

CONFIG_X86_CYCLONE_TIMER depends on CONFIG_X86_32_NON_STANDARD,
which forces drivers/clocksource/cyclone.c to be compiled. The
file doesn't do anything unless enabled by
arch/x86/kernel/apic/summit_32.c

Make CONFIG_X86_CYCLONE_TIMER depend by X86_SUMMIT instead, to
avoid unnecessary code in other non-standard systems.

Signed-off-by: Alessandro Rubini <rubini@gnudd.com>
Cc: john stultz <johnstul@us.ibm.com>
Link: http://lkml.kernel.org/r/20111028224842.GA7582@mail.gnudd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9e7a361423d6..faf39a0d6242 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -633,7 +633,7 @@ config X86_SUMMIT_NUMA
 
 config X86_CYCLONE_TIMER
 	def_bool y
-	depends on X86_32_NON_STANDARD
+	depends on X86_SUMMIT
 
 source "arch/x86/Kconfig.cpu"
 
-- 
cgit v1.2.3-55-g7522


From 45db1c6176c8171d9ae6fa6d82e07d115a5950ca Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Mon, 5 Dec 2011 16:08:49 -0800
Subject: x86, um: Use the same style generated syscall tables as native

Now when the native kernel uses a single style of generated system
call table, follow suite for UML and implement the same style, all in
C.  This requires __NR_syscall_max and NR_syscalls to be generated; on
native this is done in asm-headers.h but that file is common to all
UML architectures; therefore put it in user-headers.h instead which
already have accommodations for architecture-specific values.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/um/Makefile            |  3 ++-
 arch/x86/um/sys_call_table_32.S | 26 -------------------
 arch/x86/um/sys_call_table_32.c | 55 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/um/sys_call_table_64.c | 31 +++++++++--------------
 arch/x86/um/user-offsets.c      | 15 +++++++++++
 5 files changed, 84 insertions(+), 46 deletions(-)
 delete mode 100644 arch/x86/um/sys_call_table_32.S
 create mode 100644 arch/x86/um/sys_call_table_32.c

(limited to 'arch/x86')

diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 8fb58400e415..5d065b2222d3 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -37,7 +37,8 @@ subarch-$(CONFIG_MODULES) += ../kernel/module.o
 USER_OBJS := bugs_$(BITS).o ptrace_user.o fault.o
 
 extra-y += user-offsets.s
-$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS)
+$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) \
+	-Iarch/x86/include/generated
 
 UNPROFILE_OBJS := stub_segv.o
 CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING)
diff --git a/arch/x86/um/sys_call_table_32.S b/arch/x86/um/sys_call_table_32.S
deleted file mode 100644
index a7ca80d2dceb..000000000000
--- a/arch/x86/um/sys_call_table_32.S
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <linux/linkage.h>
-/* Steal i386 syscall table for our purposes, but with some slight changes.*/
-
-#define sys_iopl sys_ni_syscall
-#define sys_ioperm sys_ni_syscall
-
-#define sys_vm86old sys_ni_syscall
-#define sys_vm86 sys_ni_syscall
-
-#define old_mmap sys_old_mmap
-
-#define ptregs_fork sys_fork
-#define ptregs_execve sys_execve
-#define ptregs_iopl sys_iopl
-#define ptregs_vm86old sys_vm86old
-#define ptregs_clone sys_clone
-#define ptregs_vm86 sys_vm86
-#define ptregs_sigaltstack sys_sigaltstack
-#define ptregs_vfork sys_vfork
-
-.section .rodata,"a"
-
-#include "../kernel/syscall_table_32.S"
-
-ENTRY(syscall_table_size)
-.long .-sys_call_table
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
new file mode 100644
index 000000000000..b897fcae6205
--- /dev/null
+++ b/arch/x86/um/sys_call_table_32.c
@@ -0,0 +1,55 @@
+/*
+ * System call table for UML/i386, copied from arch/x86/kernel/syscall_*.c
+ * with some changes for UML.
+ */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <generated/user_constants.h>
+
+#define __NO_STUBS
+
+/*
+ * Below you can see, in terms of #define's, the differences between the x86-64
+ * and the UML syscall table.
+ */
+
+/* Not going to be implemented by UML, since we have no hardware. */
+#define stub_iopl sys_ni_syscall
+#define sys_ioperm sys_ni_syscall
+
+#define sys_vm86old sys_ni_syscall
+#define sys_vm86 sys_ni_syscall
+
+#define old_mmap sys_old_mmap
+
+#define ptregs_fork sys_fork
+#define ptregs_execve sys_execve
+#define ptregs_iopl sys_iopl
+#define ptregs_vm86old sys_vm86old
+#define ptregs_clone sys_clone
+#define ptregs_vm86 sys_vm86
+#define ptregs_sigaltstack sys_sigaltstack
+#define ptregs_vfork sys_vfork
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_32.h>
+
+#undef __SYSCALL_I386
+#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym,
+
+typedef void (*sys_call_ptr_t)(void);
+
+extern void sys_ni_syscall(void);
+
+sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
+	/*
+	 * Smells like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_32.h>
+};
+
+int syscall_table_size = sizeof(sys_call_table);
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 99522f78b162..797a639bcca5 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -1,11 +1,12 @@
 /*
- * System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c
+ * System call table for UML/x86-64, copied from arch/x86/kernel/syscall_*.c
  * with some changes for UML.
  */
 
 #include <linux/linkage.h>
 #include <linux/sys.h>
 #include <linux/cache.h>
+#include <generated/user_constants.h>
 
 #define __NO_STUBS
 
@@ -34,31 +35,23 @@
 #define stub_sigaltstack sys_sigaltstack
 #define stub_rt_sigreturn sys_rt_sigreturn
 
-#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_UNISTD_64_H
-#include "../../x86/include/asm/unistd_64.h"
+#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_64.h>
 
-#undef __SYSCALL
-#define __SYSCALL(nr, sym) [ nr ] = sym,
-#undef _ASM_X86_UNISTD_64_H
+#undef __SYSCALL_64
+#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym,
 
 typedef void (*sys_call_ptr_t)(void);
 
 extern void sys_ni_syscall(void);
 
-/*
- * We used to have a trick here which made sure that holes in the
- * x86_64 table were filled in with sys_ni_syscall, but a comment in
- * unistd_64.h says that holes aren't allowed, so the trick was
- * removed.
- * The trick looked like this
- *	[0 ... UM_NR_syscall_max] = &sys_ni_syscall
- * before including unistd_64.h - the later initializations overwrote
- * the sys_ni_syscall filler.
- */
-
 sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
-#include <asm/unistd_64.h>
+	/*
+	 * Smells like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_64.h>
 };
 
 int syscall_table_size = sizeof(sys_call_table);
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index ca49be8ddd0c..5edf4f4bbf53 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -8,6 +8,18 @@
 #include <asm/ptrace.h>
 #include <asm/types.h>
 
+#ifdef __i386__
+#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_32.h>
+};
+#else
+#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_64.h>
+};
+#endif
+
 #define DEFINE(sym, val) \
 	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
 
@@ -77,4 +89,7 @@ void foo(void)
 	DEFINE(UM_PROT_READ, PROT_READ);
 	DEFINE(UM_PROT_WRITE, PROT_WRITE);
 	DEFINE(UM_PROT_EXEC, PROT_EXEC);
+
+	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+	DEFINE(NR_syscalls, sizeof(syscalls));
 }
-- 
cgit v1.2.3-55-g7522


From a074335a370eca6d72f2ec890e4ae22923a2aea4 Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Mon, 5 Dec 2011 22:48:49 -0800
Subject: x86, um: Mark system call tables readonly

Mark the system call tables readonly, as they already are on native,
and the 32-bit UM version was in the previous assembly version.  The
32-bit version lost it due to copy and paste from the 64-bit version,
which was missing the const.

Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Link: http://lkml.kernel.org/r/tip-45db1c6176c8171d9ae6fa6d82e07d115a5950ca@git.kernel.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/um/sys_call_table_32.c | 2 +-
 arch/x86/um/sys_call_table_64.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index b897fcae6205..0606aa3e92ae 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -43,7 +43,7 @@ typedef void (*sys_call_ptr_t)(void);
 
 extern void sys_ni_syscall(void);
 
-sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
+const sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
 	/*
 	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 797a639bcca5..fe626c3ba01b 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -45,7 +45,7 @@ typedef void (*sys_call_ptr_t)(void);
 
 extern void sys_ni_syscall(void);
 
-sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
+const sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
 	/*
 	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
-- 
cgit v1.2.3-55-g7522


From 855c743a27bb58a9a521bdc485ef5acfdb69badc Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka
Date: Tue, 6 Dec 2011 09:08:34 +0100
Subject: x86/mm: Initialize high mem before free_all_bootmem()

Patch fixes a boot crash with pagealloc debugging enabled:

  Initializing HighMem for node 0 (000377fe:0003fff0)
  BUG: unable to handle kernel paging request at f6fefe80
  IP: [<c1621ab5>] find_range_array+0x5e/0x69
  [...]
  Call Trace:
   [<c1622064>] __get_free_all_memory_range+0x39/0xb4
   [<c1620dd0>] add_highpages_with_active_regions+0x18/0x9b
   [<c1621a2e>] set_highmem_pages_init+0x70/0x90
   [<c162122b>] mem_init+0x50/0x21b
   [<c16155bd>] start_kernel+0x1bf/0x31c
   [<c1615065>] i386_start_kernel+0x65/0x67

The crash happens when memblock wants to allocate big area for
temporary "struct range" array and reuses pages from top of low
memory, which were already passed to the buddy allocator.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: linux-mm@kvack.org
Cc: Mel Gorman <mgorman@suse.de>
Link: http://lkml.kernel.org/r/20111206080833.GB3105@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3bebaed5021c..a2fecb1611cc 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -744,6 +744,17 @@ void __init mem_init(void)
 #ifdef CONFIG_FLATMEM
 	BUG_ON(!mem_map);
 #endif
+	/*
+	 * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
+	 * be done before free_all_bootmem(). Memblock use free low memory for
+	 * temporary data (see find_range_array()) and for this purpose can use
+	 * pages that was already passed to the buddy allocator, hence marked as
+	 * not accessible in the page tables when compiled with
+	 * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
+	 * important here.
+	 */
+	set_highmem_pages_init();
+
 	/* this will put all low memory onto the freelists */
 	totalram_pages += free_all_bootmem();
 
@@ -755,8 +766,6 @@ void __init mem_init(void)
 		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
 			reservedpages++;
 
-	set_highmem_pages_init();
-
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
-- 
cgit v1.2.3-55-g7522


From 54c29c635ae91f5d75ced7bffeaa77ba37ca02bb Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka
Date: Tue, 29 Nov 2011 17:05:11 +0100
Subject: mm, x86: Remove debug_pagealloc_enabled

When (no)bootmem finish operation, it pass pages to buddy
allocator. Since debug_pagealloc_enabled is not set, we will do
not protect pages, what is not what we want with
CONFIG_DEBUG_PAGEALLOC=y.

To fix remove debug_pagealloc_enabled. That variable was
introduced by commit 12d6f21e "x86: do not PSE on
CONFIG_DEBUG_PAGEALLOC=y" to get more CPA (change page
attribude) code testing. But currently we have CONFIG_CPA_DEBUG,
which test CPA.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/1322582711-14571-1-git-send-email-sgruszka@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pageattr.c |  6 ------
 include/linux/mm.h     | 10 ----------
 init/main.c            |  5 -----
 mm/debug-pagealloc.c   |  3 ---
 4 files changed, 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index f9e526742fa1..5031eefa051f 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1333,12 +1333,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 					   numpages * PAGE_SIZE);
 	}
 
-	/*
-	 * If page allocator is not up yet then do not call c_p_a():
-	 */
-	if (!debug_pagealloc_enabled)
-		return;
-
 	/*
 	 * The return value is ignored as the calls cannot fail.
 	 * Large pages for identity mappings are not used at boot time
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3dc3a8c2c485..0a22db144753 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1537,23 +1537,13 @@ static inline void vm_stat_account(struct mm_struct *mm,
 #endif /* CONFIG_PROC_FS */
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-extern int debug_pagealloc_enabled;
-
 extern void kernel_map_pages(struct page *page, int numpages, int enable);
-
-static inline void enable_debug_pagealloc(void)
-{
-	debug_pagealloc_enabled = 1;
-}
 #ifdef CONFIG_HIBERNATION
 extern bool kernel_page_present(struct page *page);
 #endif /* CONFIG_HIBERNATION */
 #else
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable) {}
-static inline void enable_debug_pagealloc(void)
-{
-}
 #ifdef CONFIG_HIBERNATION
 static inline bool kernel_page_present(struct page *page) { return true; }
 #endif /* CONFIG_HIBERNATION */
diff --git a/init/main.c b/init/main.c
index 217ed23e9487..99c4ba30ba7e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -282,10 +282,6 @@ static int __init unknown_bootoption(char *param, char *val)
 	return 0;
 }
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
-int __read_mostly debug_pagealloc_enabled = 0;
-#endif
-
 static int __init init_setup(char *str)
 {
 	unsigned int i;
@@ -597,7 +593,6 @@ asmlinkage void __init start_kernel(void)
 	}
 #endif
 	page_cgroup_init();
-	enable_debug_pagealloc();
 	debug_objects_mem_init();
 	kmemleak_init();
 	setup_per_cpu_pageset();
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index 7cea557407f4..789ff70c8a4a 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -95,9 +95,6 @@ static void unpoison_pages(struct page *page, int n)
 
 void kernel_map_pages(struct page *page, int numpages, int enable)
 {
-	if (!debug_pagealloc_enabled)
-		return;
-
 	if (enable)
 		unpoison_pages(page, numpages);
 	else
-- 
cgit v1.2.3-55-g7522


From 565cbc3e934f221369a656b4469a044aa4c3f2a8 Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Tue, 6 Dec 2011 13:08:59 -0500
Subject: x86, NMI: NMI-selftest should handle the UP case properly

If no remote cpus are online, then just quietly skip the remote
IPI test for now.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: andi@firstfloor.org
Cc: torvalds@linux-foundation.org
Cc: peterz@infradead.org
Cc: robert.richter@amd.com
Link: http://lkml.kernel.org/r/20111206180859.GR1669@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi_selftest.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 572adb622251..1e42a23c1f2a 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -90,7 +90,8 @@ static void remote_ipi(void)
 {
 	cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
-	test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+	if (!cpumask_empty(nmi_ipi_mask))
+		test_nmi_ipi(to_cpumask(nmi_ipi_mask));
 }
 
 static void local_ipi(void)
-- 
cgit v1.2.3-55-g7522


From d2db6610219cbcadceea6c43ee03d89068b7d759 Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka
Date: Wed, 7 Dec 2011 17:29:10 +0900
Subject: x86: Add stack top margin for stack overflow checking

It seems that a margin for stack overflow checking is added to
top of a kernel stack but is not added to IRQ and exception
stacks in stack_overflow_check(). Therefore, the overflows of
IRQ and exception stacks are always detected only after they
actually occurred and data corruption might occur due to them.

This patch adds the margin to top of IRQ and exception stacks
as well as a kernel stack to enhance reliability.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20111207082910.9847.3359.stgit@ltc219.sdl.hitachi.co.jp
[ removed the #undef - we typically don't do that for uncommon names ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq_64.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 54e2b2b2e250..d04d3ecded62 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -38,6 +38,7 @@ int sysctl_panic_on_stackoverflow;
 static inline void stack_overflow_check(struct pt_regs *regs)
 {
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
+#define STACK_TOP_MARGIN	128
 	struct orig_ist *oist;
 	u64 irq_stack_top, irq_stack_bottom;
 	u64 estack_top, estack_bottom;
@@ -47,17 +48,18 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 		return;
 
 	if (regs->sp >= curbase + sizeof(struct thread_info) +
-				  sizeof(struct pt_regs) + 128 &&
+				  sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
 	    regs->sp <= curbase + THREAD_SIZE)
 		return;
 
-	irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack);
+	irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) +
+			STACK_TOP_MARGIN;
 	irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
 	if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
 		return;
 
 	oist = &__get_cpu_var(orig_ist);
-	estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ;
+	estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
 	estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
 	if (regs->sp >= estack_top && regs->sp <= estack_bottom)
 		return;
-- 
cgit v1.2.3-55-g7522


From 4f941c57fe7e04e38c2401d53516bfd16038c9ab Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Wed, 7 Dec 2011 16:06:30 -0500
Subject: x86, NMI: NMI selftest depends on the local apic

The selftest doesn't work with out a local apic for now.

Reported-by: Randy Durlap <rdunlap@xenotime.net>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Link: http://lkml.kernel.org/r/20111207210630.GI1669@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 97da3c17b424..aa4158f3ce62 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -289,7 +289,7 @@ config DEBUG_STRICT_USER_COPY_CHECKS
 
 config DEBUG_NMI_SELFTEST
 	bool "NMI Selftest"
-	depends on DEBUG_KERNEL
+	depends on DEBUG_KERNEL && X86_LOCAL_APIC
 	---help---
 	  Enabling this option turns on a quick NMI selftest to verify
 	  that the NMI behaves correctly.
-- 
cgit v1.2.3-55-g7522


From 3d6240b53e34e372be007e08f7066e7625910675 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Wed, 7 Dec 2011 14:06:12 +0300
Subject: x86, NMI: Add to_cpumask() to silence compile warning

Gcc complains if we don't cast this to a struct cpumask pointer.
arch/x86/kernel/nmi_selftest.c:93:2:

	warning: passing argument 1 of ‘cpumask_empty’ from
	incompatible pointer type [enabled by default]

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Don Zickus <dzickus@redhat.com>
Link: http://lkml.kernel.org/r/20111207110612.GA3437@mwanda
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi_selftest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 1e42a23c1f2a..0d01a8ea4e11 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -90,7 +90,7 @@ static void remote_ipi(void)
 {
 	cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
-	if (!cpumask_empty(nmi_ipi_mask))
+	if (!cpumask_empty(to_cpumask(nmi_ipi_mask)))
 		test_nmi_ipi(to_cpumask(nmi_ipi_mask));
 }
 
-- 
cgit v1.2.3-55-g7522


From 54eed6cb16ec315565aaaf8e34252ca253a68b7b Mon Sep 17 00:00:00 2001
From: Petr Holasek
Date: Thu, 8 Dec 2011 13:16:41 +0100
Subject: x86/numa: Add constraints check for nid parameters

This patch adds constraint checks to the numa_set_distance()
function.

When the check triggers (this should not happen normally) it
emits a warning and avoids a store to a negative index in
numa_distance[] array - i.e. avoids memory corruption.

Negative ids can be passed when the pxm-to-nids mapping is not
properly filled while parsing the SRAT.

Signed-off-by: Petr Holasek <pholasek@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Anton Arapov <anton@redhat.com>
Link: http://lkml.kernel.org/r/20111208121640.GA2229@dhcp-27-244.brq.redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/numa.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index fbeaaf416610..cdc00543d375 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -430,8 +430,9 @@ static int __init numa_alloc_distance(void)
  * calls are ignored until the distance table is reset with
  * numa_reset_distance().
  *
- * If @from or @to is higher than the highest known node at the time of
- * table creation or @distance doesn't make sense, the call is ignored.
+ * If @from or @to is higher than the highest known node or lower than zero
+ * at the time of table creation or @distance doesn't make sense, the call
+ * is ignored.
  * This is to allow simplification of specific NUMA config implementations.
  */
 void __init numa_set_distance(int from, int to, int distance)
@@ -439,8 +440,9 @@ void __init numa_set_distance(int from, int to, int distance)
 	if (!numa_distance && numa_alloc_distance() < 0)
 		return;
 
-	if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
-		printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
+	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
+			from < 0 || to < 0) {
+		pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
 			    from, to, distance);
 		return;
 	}
-- 
cgit v1.2.3-55-g7522


From 819a693b5a503788a7af54a3d95c4857780a7230 Mon Sep 17 00:00:00 2001
From: Wang YanQing
Date: Thu, 8 Dec 2011 12:00:27 +0800
Subject: typo fixes: aera -> area, exntension -> extension

One printk and one comment typo fix.

Signed-off-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/pci/pcbios.c            | 2 +-
 net/ipv4/netfilter/nf_nat_core.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index db0e9a51e611..da8fe0535ff4 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -44,7 +44,7 @@ static inline void set_bios_x(void)
 	pcibios_enabled = 1;
 	set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
 	if (__supported_pte_mask & _PAGE_NX)
-		printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n");
+		printk(KERN_INFO "PCI : PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
 }
 
 /*
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 447bc5cfdc6c..072bb5330315 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -318,7 +318,7 @@ nf_nat_setup_info(struct nf_conn *ct,
 		srchash = hash_by_src(net, nf_ct_zone(ct),
 				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 		spin_lock_bh(&nf_nat_lock);
-		/* nf_conntrack_alter_reply might re-allocate exntension aera */
+		/* nf_conntrack_alter_reply might re-allocate extension area */
 		nat = nfct_nat(ct);
 		nat->ct = ct;
 		hlist_add_head_rcu(&nat->bysource,
-- 
cgit v1.2.3-55-g7522


From 47db9e7c808a45b1f86971f25eca5e38fa95ab86 Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Fri, 9 Dec 2011 11:13:59 -0800
Subject: x86, um: Fix typo in 32-bit system call modifications

We override sys_iopl(), not stub_iopl(); the latter is a 64-bitism
that doesn't apply to i386 in the first place.

Reported-by: Richard Weinberger <richard@nod.at>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/um/sys_call_table_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 0606aa3e92ae..416bd40c0eba 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -16,7 +16,7 @@
  */
 
 /* Not going to be implemented by UML, since we have no hardware. */
-#define stub_iopl sys_ni_syscall
+#define sys_iopl sys_ni_syscall
 #define sys_ioperm sys_ni_syscall
 
 #define sys_vm86old sys_ni_syscall
-- 
cgit v1.2.3-55-g7522


From 8af21e7e71d1ac56d9b66fb787a14fd66af7f5f7 Mon Sep 17 00:00:00 2001
From: Matt Fleming
Date: Sat, 27 Aug 2011 09:35:45 +0100
Subject: x86: Add missing bzImage fields to struct setup_header

commit 37ba7ab5e33c ("x86, boot: make kernel_alignment adjustable; new
bzImage fields") introduced some new fields into the bzImage header
but struct setup_header was not updated accordingly. Add the missing
'pref_address' and 'init_size' fields.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1318848017-12301-1-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/bootparam.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index e020d88ec02d..2f90c51cc49d 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -64,6 +64,8 @@ struct setup_header {
 	__u32	payload_offset;
 	__u32	payload_length;
 	__u64	setup_data;
+	__u64	pref_address;
+	__u32	init_size;
 } __attribute__((packed));
 
 struct sys_desc_table {
-- 
cgit v1.2.3-55-g7522


From f7d7d01be53cb47e0ae212c4e968aa28b82d2138 Mon Sep 17 00:00:00 2001
From: Matt Fleming
Date: Tue, 15 Nov 2011 12:56:14 +0000
Subject: x86: Don't use magic strings for EFI loader signature

Introduce a symbol, EFI_LOADER_SIGNATURE instead of using the magic
strings, which also helps to reduce the amount of ifdeffery.

Cc: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1318848017-12301-1-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/efi.h | 4 ++++
 arch/x86/kernel/setup.c    | 7 +------
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index b8d8bfcd44a9..26d8c18d5faa 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -3,6 +3,8 @@
 
 #ifdef CONFIG_X86_32
 
+#define EFI_LOADER_SIGNATURE	"EL32"
+
 extern unsigned long asmlinkage efi_call_phys(void *, ...);
 
 #define efi_call_phys0(f)		efi_call_phys(f)
@@ -35,6 +37,8 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
 
 #else /* !CONFIG_X86_32 */
 
+#define EFI_LOADER_SIGNATURE	"EL64"
+
 extern u64 efi_call0(void *fp);
 extern u64 efi_call1(void *fp, u64 arg1);
 extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9a9e40fb091c..4d5243c31ac4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -752,12 +752,7 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #ifdef CONFIG_EFI
 	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-#ifdef CONFIG_X86_32
-		     "EL32",
-#else
-		     "EL64",
-#endif
-	 4)) {
+		     EFI_LOADER_SIGNATURE, 4)) {
 		efi_enabled = 1;
 		efi_memblock_x86_reserve_range();
 	}
-- 
cgit v1.2.3-55-g7522


From 291f36325f9f252bd76ef5f603995f37e453fc60 Mon Sep 17 00:00:00 2001
From: Matt Fleming
Date: Mon, 12 Dec 2011 21:27:52 +0000
Subject: x86, efi: EFI boot stub support

There is currently a large divide between kernel development and the
development of EFI boot loaders. The idea behind this patch is to give
the kernel developers full control over the EFI boot process. As
H. Peter Anvin put it,

"The 'kernel carries its own stub' approach been very successful in
dealing with BIOS, and would make a lot of sense to me for EFI as
well."

This patch introduces an EFI boot stub that allows an x86 bzImage to
be loaded and executed by EFI firmware. The bzImage appears to the
firmware as an EFI application. Luckily there are enough free bits
within the bzImage header so that it can masquerade as an EFI
application, thereby coercing the EFI firmware into loading it and
jumping to its entry point. The beauty of this masquerading approach
is that both BIOS and EFI boot loaders can still load and run the same
bzImage, thereby allowing a single kernel image to work in any boot
environment.

The EFI boot stub supports multiple initrds, but they must exist on
the same partition as the bzImage. Command-line arguments for the
kernel can be appended after the bzImage name when run from the EFI
shell, e.g.

Shell> bzImage console=ttyS0 root=/dev/sdb initrd=initrd.img

v7:
 - Fix checkpatch warnings.

v6:

 - Try to allocate initrd memory just below hdr->inird_addr_max.

v5:

 - load_options_size is UTF-16, which needs dividing by 2 to convert
   to the corresponding ASCII size.

v4:

 - Don't read more than image->load_options_size

v3:

 - Fix following warnings when compiling CONFIG_EFI_STUB=n

   arch/x86/boot/tools/build.c: In function ‘main’:
   arch/x86/boot/tools/build.c:138:24: warning: unused variable ‘pe_header’
   arch/x86/boot/tools/build.c:138:15: warning: unused variable ‘file_sz’

 - As reported by Matthew Garrett, some Apple machines have GOPs that
   don't have hardware attached. We need to weed these out by
   searching for ones that handle the PCIIO protocol.

 - Don't allocate memory if no initrds are on cmdline
 - Don't trust image->load_options_size

Maarten Lankhorst noted:
 - Don't strip first argument when booted from efibootmgr
 - Don't allocate too much memory for cmdline
 - Don't update cmdline_size, the kernel considers it read-only
 - Don't accept '\n' for initrd names

v2:

 - File alignment was too large, was 8192 should be 512. Reported by
   Maarten Lankhorst on LKML.
 - Added UGA support for graphics
 - Use VIDEO_TYPE_EFI instead of hard-coded number.
 - Move linelength assignment until after we've assigned depth
 - Dynamically fill out AddressOfEntryPoint in tools/build.c
 - Don't use magic number for GDT/TSS stuff. Requested by Andi Kleen
 - The bzImage may need to be relocated as it may have been loaded at
   a high address address by the firmware. This was required to get my
   macbook booting because the firmware loaded it at 0x7cxxxxxx, which
   triggers this error in decompress_kernel(),

	if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
		error("Destination address too large");

Cc: Mike Waychison <mikew@google.com>
Cc: Matthew Garrett <mjg@redhat.com>
Tested-by: Henrik Rydberg <rydberg@euromail.se>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1321383097.2657.9.camel@mfleming-mobl1.ger.corp.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/Kconfig                       |    7 +
 arch/x86/boot/compressed/Makefile      |   10 +-
 arch/x86/boot/compressed/eboot.c       | 1014 ++++++++++++++++++++++++++++++++
 arch/x86/boot/compressed/eboot.h       |   60 ++
 arch/x86/boot/compressed/efi_stub_32.S |   86 +++
 arch/x86/boot/compressed/efi_stub_64.S |    1 +
 arch/x86/boot/compressed/head_32.S     |   22 +
 arch/x86/boot/compressed/head_64.S     |   20 +
 arch/x86/boot/compressed/string.c      |    9 +
 arch/x86/boot/header.S                 |  158 +++++
 arch/x86/boot/string.c                 |   35 ++
 arch/x86/boot/tools/build.c            |   39 ++
 arch/x86/kernel/asm-offsets.c          |    2 +
 13 files changed, 1462 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/boot/compressed/eboot.c
 create mode 100644 arch/x86/boot/compressed/eboot.h
 create mode 100644 arch/x86/boot/compressed/efi_stub_32.S
 create mode 100644 arch/x86/boot/compressed/efi_stub_64.S

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index efb42949cc09..d71b656bcb97 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1478,6 +1478,13 @@ config EFI
 	  resultant kernel should continue to boot on existing non-EFI
 	  platforms.
 
+config EFI_STUB
+       bool "EFI stub support"
+       depends on EFI
+       ---help---
+          This kernel feature allows a bzImage to be loaded directly
+	  by EFI firmware without the use of a bootloader.
+
 config SECCOMP
 	def_bool y
 	prompt "Enable seccomp to safely compute untrusted bytecode"
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 09664efb9cee..b123b9a8f5b3 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -23,7 +23,15 @@ LDFLAGS_vmlinux := -T
 
 hostprogs-y	:= mkpiggy
 
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE
+VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
+	$(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
+	$(obj)/piggy.o
+
+ifeq ($(CONFIG_EFI_STUB), y)
+	VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o
+endif
+
+$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE
 	$(call if_changed,ld)
 	@:
 
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
new file mode 100644
index 000000000000..4055e63d0b04
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.c
@@ -0,0 +1,1014 @@
+/* -----------------------------------------------------------------------
+ *
+ *   Copyright 2011 Intel Corporation; author Matt Fleming
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+#include <linux/efi.h>
+#include <asm/efi.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+
+#include "eboot.h"
+
+static efi_system_table_t *sys_table;
+
+static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size,
+			      unsigned long *desc_size)
+{
+	efi_memory_desc_t *m = NULL;
+	efi_status_t status;
+	unsigned long key;
+	u32 desc_version;
+
+	*map_size = sizeof(*m) * 32;
+again:
+	/*
+	 * Add an additional efi_memory_desc_t because we're doing an
+	 * allocation which may be in a new descriptor region.
+	 */
+	*map_size += sizeof(*m);
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, *map_size, (void **)&m);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	status = efi_call_phys5(sys_table->boottime->get_memory_map, map_size,
+				m, &key, desc_size, &desc_version);
+	if (status == EFI_BUFFER_TOO_SMALL) {
+		efi_call_phys1(sys_table->boottime->free_pool, m);
+		goto again;
+	}
+
+	if (status != EFI_SUCCESS)
+		efi_call_phys1(sys_table->boottime->free_pool, m);
+
+fail:
+	*map = m;
+	return status;
+}
+
+/*
+ * Allocate at the highest possible address that is not above 'max'.
+ */
+static efi_status_t high_alloc(unsigned long size, unsigned long align,
+			      unsigned long *addr, unsigned long max)
+{
+	unsigned long map_size, desc_size;
+	efi_memory_desc_t *map;
+	efi_status_t status;
+	unsigned long nr_pages;
+	u64 max_addr = 0;
+	int i;
+
+	status = __get_map(&map, &map_size, &desc_size);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+again:
+	for (i = 0; i < map_size / desc_size; i++) {
+		efi_memory_desc_t *desc;
+		unsigned long m = (unsigned long)map;
+		u64 start, end;
+
+		desc = (efi_memory_desc_t *)(m + (i * desc_size));
+		if (desc->type != EFI_CONVENTIONAL_MEMORY)
+			continue;
+
+		if (desc->num_pages < nr_pages)
+			continue;
+
+		start = desc->phys_addr;
+		end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
+
+		if ((start + size) > end || (start + size) > max)
+			continue;
+
+		if (end - size > max)
+			end = max;
+
+		if (round_down(end - size, align) < start)
+			continue;
+
+		start = round_down(end - size, align);
+
+		/*
+		 * Don't allocate at 0x0. It will confuse code that
+		 * checks pointers against NULL.
+		 */
+		if (start == 0x0)
+			continue;
+
+		if (start > max_addr)
+			max_addr = start;
+	}
+
+	if (!max_addr)
+		status = EFI_NOT_FOUND;
+	else {
+		status = efi_call_phys4(sys_table->boottime->allocate_pages,
+					EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+					nr_pages, &max_addr);
+		if (status != EFI_SUCCESS) {
+			max = max_addr;
+			max_addr = 0;
+			goto again;
+		}
+
+		*addr = max_addr;
+	}
+
+free_pool:
+	efi_call_phys1(sys_table->boottime->free_pool, map);
+
+fail:
+	return status;
+}
+
+/*
+ * Allocate at the lowest possible address.
+ */
+static efi_status_t low_alloc(unsigned long size, unsigned long align,
+			      unsigned long *addr)
+{
+	unsigned long map_size, desc_size;
+	efi_memory_desc_t *map;
+	efi_status_t status;
+	unsigned long nr_pages;
+	int i;
+
+	status = __get_map(&map, &map_size, &desc_size);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+	for (i = 0; i < map_size / desc_size; i++) {
+		efi_memory_desc_t *desc;
+		unsigned long m = (unsigned long)map;
+		u64 start, end;
+
+		desc = (efi_memory_desc_t *)(m + (i * desc_size));
+
+		if (desc->type != EFI_CONVENTIONAL_MEMORY)
+			continue;
+
+		if (desc->num_pages < nr_pages)
+			continue;
+
+		start = desc->phys_addr;
+		end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
+
+		/*
+		 * Don't allocate at 0x0. It will confuse code that
+		 * checks pointers against NULL. Skip the first 8
+		 * bytes so we start at a nice even number.
+		 */
+		if (start == 0x0)
+			start += 8;
+
+		start = round_up(start, align);
+		if ((start + size) > end)
+			continue;
+
+		status = efi_call_phys4(sys_table->boottime->allocate_pages,
+					EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+					nr_pages, &start);
+		if (status == EFI_SUCCESS) {
+			*addr = start;
+			break;
+		}
+	}
+
+	if (i == map_size / desc_size)
+		status = EFI_NOT_FOUND;
+
+free_pool:
+	efi_call_phys1(sys_table->boottime->free_pool, map);
+fail:
+	return status;
+}
+
+static void low_free(unsigned long size, unsigned long addr)
+{
+	unsigned long nr_pages;
+
+	nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+	efi_call_phys2(sys_table->boottime->free_pages, addr, size);
+}
+
+static void find_bits(unsigned long mask, u8 *pos, u8 *size)
+{
+	u8 first, len;
+
+	first = 0;
+	len = 0;
+
+	if (mask) {
+		while (!(mask & 0x1)) {
+			mask = mask >> 1;
+			first++;
+		}
+
+		while (mask & 0x1) {
+			mask = mask >> 1;
+			len++;
+		}
+	}
+
+	*pos = first;
+	*size = len;
+}
+
+/*
+ * See if we have Graphics Output Protocol
+ */
+static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
+			      unsigned long size)
+{
+	struct efi_graphics_output_protocol *gop, *first_gop;
+	struct efi_pixel_bitmask pixel_info;
+	unsigned long nr_gops;
+	efi_status_t status;
+	void **gop_handle;
+	u16 width, height;
+	u32 fb_base, fb_size;
+	u32 pixels_per_scan_line;
+	int pixel_format;
+	int i;
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, size, &gop_handle);
+	if (status != EFI_SUCCESS)
+		return status;
+
+	status = efi_call_phys5(sys_table->boottime->locate_handle,
+				EFI_LOCATE_BY_PROTOCOL, proto,
+				NULL, &size, gop_handle);
+	if (status != EFI_SUCCESS)
+		goto free_handle;
+
+	first_gop = NULL;
+
+	nr_gops = size / sizeof(void *);
+	for (i = 0; i < nr_gops; i++) {
+		struct efi_graphics_output_mode_info *info;
+		efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
+		void *pciio;
+		void *h = gop_handle[i];
+
+		status = efi_call_phys3(sys_table->boottime->handle_protocol,
+					h, proto, &gop);
+		if (status != EFI_SUCCESS)
+			continue;
+
+		efi_call_phys3(sys_table->boottime->handle_protocol,
+			       h, &pciio_proto, &pciio);
+
+		status = efi_call_phys4(gop->query_mode, gop,
+					gop->mode->mode, &size, &info);
+		if (status == EFI_SUCCESS && (!first_gop || pciio)) {
+			/*
+			 * Apple provide GOPs that are not backed by
+			 * real hardware (they're used to handle
+			 * multiple displays). The workaround is to
+			 * search for a GOP implementing the PCIIO
+			 * protocol, and if one isn't found, to just
+			 * fallback to the first GOP.
+			 */
+			width = info->horizontal_resolution;
+			height = info->vertical_resolution;
+			fb_base = gop->mode->frame_buffer_base;
+			fb_size = gop->mode->frame_buffer_size;
+			pixel_format = info->pixel_format;
+			pixel_info = info->pixel_information;
+			pixels_per_scan_line = info->pixels_per_scan_line;
+
+			/*
+			 * Once we've found a GOP supporting PCIIO,
+			 * don't bother looking any further.
+			 */
+			if (pciio)
+				break;
+
+			first_gop = gop;
+		}
+	}
+
+	/* Did we find any GOPs? */
+	if (!first_gop)
+		goto free_handle;
+
+	/* EFI framebuffer */
+	si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+	si->lfb_width = width;
+	si->lfb_height = height;
+	si->lfb_base = fb_base;
+	si->lfb_size = fb_size;
+	si->pages = 1;
+
+	if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
+		si->lfb_depth = 32;
+		si->lfb_linelength = pixels_per_scan_line * 4;
+		si->red_size = 8;
+		si->red_pos = 0;
+		si->green_size = 8;
+		si->green_pos = 8;
+		si->blue_size = 8;
+		si->blue_pos = 16;
+		si->rsvd_size = 8;
+		si->rsvd_pos = 24;
+	} else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) {
+		si->lfb_depth = 32;
+		si->lfb_linelength = pixels_per_scan_line * 4;
+		si->red_size = 8;
+		si->red_pos = 16;
+		si->green_size = 8;
+		si->green_pos = 8;
+		si->blue_size = 8;
+		si->blue_pos = 0;
+		si->rsvd_size = 8;
+		si->rsvd_pos = 24;
+	} else if (pixel_format == PIXEL_BIT_MASK) {
+		find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size);
+		find_bits(pixel_info.green_mask, &si->green_pos,
+			  &si->green_size);
+		find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size);
+		find_bits(pixel_info.reserved_mask, &si->rsvd_pos,
+			  &si->rsvd_size);
+		si->lfb_depth = si->red_size + si->green_size +
+			si->blue_size + si->rsvd_size;
+		si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8;
+	} else {
+		si->lfb_depth = 4;
+		si->lfb_linelength = si->lfb_width / 2;
+		si->red_size = 0;
+		si->red_pos = 0;
+		si->green_size = 0;
+		si->green_pos = 0;
+		si->blue_size = 0;
+		si->blue_pos = 0;
+		si->rsvd_size = 0;
+		si->rsvd_pos = 0;
+	}
+
+free_handle:
+	efi_call_phys1(sys_table->boottime->free_pool, gop_handle);
+	return status;
+}
+
+/*
+ * See if we have Universal Graphics Adapter (UGA) protocol
+ */
+static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
+			      unsigned long size)
+{
+	struct efi_uga_draw_protocol *uga, *first_uga;
+	unsigned long nr_ugas;
+	efi_status_t status;
+	u32 width, height;
+	void **uga_handle = NULL;
+	int i;
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, size, &uga_handle);
+	if (status != EFI_SUCCESS)
+		return status;
+
+	status = efi_call_phys5(sys_table->boottime->locate_handle,
+				EFI_LOCATE_BY_PROTOCOL, uga_proto,
+				NULL, &size, uga_handle);
+	if (status != EFI_SUCCESS)
+		goto free_handle;
+
+	first_uga = NULL;
+
+	nr_ugas = size / sizeof(void *);
+	for (i = 0; i < nr_ugas; i++) {
+		efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
+		void *handle = uga_handle[i];
+		u32 w, h, depth, refresh;
+		void *pciio;
+
+		status = efi_call_phys3(sys_table->boottime->handle_protocol,
+					handle, uga_proto, &uga);
+		if (status != EFI_SUCCESS)
+			continue;
+
+		efi_call_phys3(sys_table->boottime->handle_protocol,
+			       handle, &pciio_proto, &pciio);
+
+		status = efi_call_phys5(uga->get_mode, uga, &w, &h,
+					&depth, &refresh);
+		if (status == EFI_SUCCESS && (!first_uga || pciio)) {
+			width = w;
+			height = h;
+
+			/*
+			 * Once we've found a UGA supporting PCIIO,
+			 * don't bother looking any further.
+			 */
+			if (pciio)
+				break;
+
+			first_uga = uga;
+		}
+	}
+
+	if (!first_uga)
+		goto free_handle;
+
+	/* EFI framebuffer */
+	si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+	si->lfb_depth = 32;
+	si->lfb_width = width;
+	si->lfb_height = height;
+
+	si->red_size = 8;
+	si->red_pos = 16;
+	si->green_size = 8;
+	si->green_pos = 8;
+	si->blue_size = 8;
+	si->blue_pos = 0;
+	si->rsvd_size = 8;
+	si->rsvd_pos = 24;
+
+
+free_handle:
+	efi_call_phys1(sys_table->boottime->free_pool, uga_handle);
+	return status;
+}
+
+void setup_graphics(struct boot_params *boot_params)
+{
+	efi_guid_t graphics_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID;
+	struct screen_info *si;
+	efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
+	efi_status_t status;
+	unsigned long size;
+	void **gop_handle = NULL;
+	void **uga_handle = NULL;
+
+	si = &boot_params->screen_info;
+	memset(si, 0, sizeof(*si));
+
+	size = 0;
+	status = efi_call_phys5(sys_table->boottime->locate_handle,
+				EFI_LOCATE_BY_PROTOCOL, &graphics_proto,
+				NULL, &size, gop_handle);
+	if (status == EFI_BUFFER_TOO_SMALL)
+		status = setup_gop(si, &graphics_proto, size);
+
+	if (status != EFI_SUCCESS) {
+		size = 0;
+		status = efi_call_phys5(sys_table->boottime->locate_handle,
+					EFI_LOCATE_BY_PROTOCOL, &uga_proto,
+					NULL, &size, uga_handle);
+		if (status == EFI_BUFFER_TOO_SMALL)
+			setup_uga(si, &uga_proto, size);
+	}
+}
+
+struct initrd {
+	efi_file_handle_t *handle;
+	u64 size;
+};
+
+/*
+ * Check the cmdline for a LILO-style initrd= arguments.
+ *
+ * We only support loading an initrd from the same filesystem as the
+ * kernel image.
+ */
+static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
+				    struct setup_header *hdr)
+{
+	struct initrd *initrds;
+	unsigned long initrd_addr;
+	efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
+	u64 initrd_total;
+	efi_file_io_interface_t *io;
+	efi_file_handle_t *fh;
+	efi_status_t status;
+	int nr_initrds;
+	char *str;
+	int i, j, k;
+
+	initrd_addr = 0;
+	initrd_total = 0;
+
+	str = (char *)(unsigned long)hdr->cmd_line_ptr;
+
+	j = 0;			/* See close_handles */
+
+	if (!str || !*str)
+		return EFI_SUCCESS;
+
+	for (nr_initrds = 0; *str; nr_initrds++) {
+		str = strstr(str, "initrd=");
+		if (!str)
+			break;
+
+		str += 7;
+
+		/* Skip any leading slashes */
+		while (*str == '/' || *str == '\\')
+			str++;
+
+		while (*str && *str != ' ' && *str != '\n')
+			str++;
+	}
+
+	if (!nr_initrds)
+		return EFI_SUCCESS;
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA,
+				nr_initrds * sizeof(*initrds),
+				&initrds);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	str = (char *)(unsigned long)hdr->cmd_line_ptr;
+	for (i = 0; i < nr_initrds; i++) {
+		struct initrd *initrd;
+		efi_file_handle_t *h;
+		efi_file_info_t *info;
+		efi_char16_t filename[256];
+		unsigned long info_sz;
+		efi_guid_t info_guid = EFI_FILE_INFO_ID;
+		efi_char16_t *p;
+		u64 file_sz;
+
+		str = strstr(str, "initrd=");
+		if (!str)
+			break;
+
+		str += 7;
+
+		initrd = &initrds[i];
+		p = filename;
+
+		/* Skip any leading slashes */
+		while (*str == '/' || *str == '\\')
+			str++;
+
+		while (*str && *str != ' ' && *str != '\n') {
+			if (p >= filename + sizeof(filename))
+				break;
+
+			*p++ = *str++;
+		}
+
+		*p = '\0';
+
+		/* Only open the volume once. */
+		if (!i) {
+			efi_boot_services_t *boottime;
+
+			boottime = sys_table->boottime;
+
+			status = efi_call_phys3(boottime->handle_protocol,
+					image->device_handle, &fs_proto, &io);
+			if (status != EFI_SUCCESS)
+				goto free_initrds;
+
+			status = efi_call_phys2(io->open_volume, io, &fh);
+			if (status != EFI_SUCCESS)
+				goto free_initrds;
+		}
+
+		status = efi_call_phys5(fh->open, fh, &h, filename,
+					EFI_FILE_MODE_READ, (u64)0);
+		if (status != EFI_SUCCESS)
+			goto close_handles;
+
+		initrd->handle = h;
+
+		info_sz = 0;
+		status = efi_call_phys4(h->get_info, h, &info_guid,
+					&info_sz, NULL);
+		if (status != EFI_BUFFER_TOO_SMALL)
+			goto close_handles;
+
+grow:
+		status = efi_call_phys3(sys_table->boottime->allocate_pool,
+					EFI_LOADER_DATA, info_sz, &info);
+		if (status != EFI_SUCCESS)
+			goto close_handles;
+
+		status = efi_call_phys4(h->get_info, h, &info_guid,
+					&info_sz, info);
+		if (status == EFI_BUFFER_TOO_SMALL) {
+			efi_call_phys1(sys_table->boottime->free_pool, info);
+			goto grow;
+		}
+
+		file_sz = info->file_size;
+		efi_call_phys1(sys_table->boottime->free_pool, info);
+
+		if (status != EFI_SUCCESS)
+			goto close_handles;
+
+		initrd->size = file_sz;
+		initrd_total += file_sz;
+	}
+
+	if (initrd_total) {
+		unsigned long addr;
+
+		/*
+		 * Multiple initrd's need to be at consecutive
+		 * addresses in memory, so allocate enough memory for
+		 * all the initrd's.
+		 */
+		status = high_alloc(initrd_total, 0x1000,
+				   &initrd_addr, hdr->initrd_addr_max);
+		if (status != EFI_SUCCESS)
+			goto close_handles;
+
+		/* We've run out of free low memory. */
+		if (initrd_addr > hdr->initrd_addr_max) {
+			status = EFI_INVALID_PARAMETER;
+			goto free_initrd_total;
+		}
+
+		addr = initrd_addr;
+		for (j = 0; j < nr_initrds; j++) {
+			u64 size;
+
+			size = initrds[j].size;
+			status = efi_call_phys3(fh->read, initrds[j].handle,
+						&size, addr);
+			if (status != EFI_SUCCESS)
+				goto free_initrd_total;
+
+			efi_call_phys1(fh->close, initrds[j].handle);
+
+			addr += size;
+		}
+
+	}
+
+	efi_call_phys1(sys_table->boottime->free_pool, initrds);
+
+	hdr->ramdisk_image = initrd_addr;
+	hdr->ramdisk_size = initrd_total;
+
+	return status;
+
+free_initrd_total:
+	low_free(initrd_total, initrd_addr);
+
+close_handles:
+	for (k = j; k < nr_initrds; k++)
+		efi_call_phys1(fh->close, initrds[k].handle);
+free_initrds:
+	efi_call_phys1(sys_table->boottime->free_pool, initrds);
+fail:
+	hdr->ramdisk_image = 0;
+	hdr->ramdisk_size = 0;
+
+	return status;
+}
+
+/*
+ * Because the x86 boot code expects to be passed a boot_params we
+ * need to create one ourselves (usually the bootloader would create
+ * one for us).
+ */
+static efi_status_t make_boot_params(struct boot_params *boot_params,
+				     efi_loaded_image_t *image,
+				     void *handle)
+{
+	struct efi_info *efi = &boot_params->efi_info;
+	struct apm_bios_info *bi = &boot_params->apm_bios_info;
+	struct sys_desc_table *sdt = &boot_params->sys_desc_table;
+	struct e820entry *e820_map = &boot_params->e820_map[0];
+	struct e820entry *prev = NULL;
+	struct setup_header *hdr = &boot_params->hdr;
+	unsigned long size, key, desc_size, _size;
+	efi_memory_desc_t *mem_map;
+	void *options = image->load_options;
+	u32 load_options_size = image->load_options_size / 2; /* ASCII */
+	int options_size = 0;
+	efi_status_t status;
+	__u32 desc_version;
+	unsigned long cmdline;
+	u8 nr_entries;
+	u16 *s2;
+	u8 *s1;
+	int i;
+
+	hdr->type_of_loader = 0x21;
+
+	/* Convert unicode cmdline to ascii */
+	cmdline = 0;
+	s2 = (u16 *)options;
+
+	if (s2) {
+		while (*s2 && *s2 != '\n' && options_size < load_options_size) {
+			s2++;
+			options_size++;
+		}
+
+		if (options_size) {
+			if (options_size > hdr->cmdline_size)
+				options_size = hdr->cmdline_size;
+
+			options_size++;	/* NUL termination */
+
+			status = low_alloc(options_size, 1, &cmdline);
+			if (status != EFI_SUCCESS)
+				goto fail;
+
+			s1 = (u8 *)(unsigned long)cmdline;
+			s2 = (u16 *)options;
+
+			for (i = 0; i < options_size - 1; i++)
+				*s1++ = *s2++;
+
+			*s1 = '\0';
+		}
+	}
+
+	hdr->cmd_line_ptr = cmdline;
+
+	hdr->ramdisk_image = 0;
+	hdr->ramdisk_size = 0;
+
+	status = handle_ramdisks(image, hdr);
+	if (status != EFI_SUCCESS)
+		goto free_cmdline;
+
+	setup_graphics(boot_params);
+
+	/* Clear APM BIOS info */
+	memset(bi, 0, sizeof(*bi));
+
+	memset(sdt, 0, sizeof(*sdt));
+
+	memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
+
+	size = sizeof(*mem_map) * 32;
+
+again:
+	size += sizeof(*mem_map);
+	_size = size;
+	status = low_alloc(size, 1, (unsigned long *)&mem_map);
+	if (status != EFI_SUCCESS)
+		goto free_cmdline;
+
+	status = efi_call_phys5(sys_table->boottime->get_memory_map, &size,
+				mem_map, &key, &desc_size, &desc_version);
+	if (status == EFI_BUFFER_TOO_SMALL) {
+		low_free(_size, (unsigned long)mem_map);
+		goto again;
+	}
+
+	if (status != EFI_SUCCESS)
+		goto free_mem_map;
+
+	efi->efi_systab = (unsigned long)sys_table;
+	efi->efi_memdesc_size = desc_size;
+	efi->efi_memdesc_version = desc_version;
+	efi->efi_memmap = (unsigned long)mem_map;
+	efi->efi_memmap_size = size;
+
+#ifdef CONFIG_X86_64
+	efi->efi_systab_hi = (unsigned long)sys_table >> 32;
+	efi->efi_memmap_hi = (unsigned long)mem_map >> 32;
+#endif
+
+	/* Might as well exit boot services now */
+	status = efi_call_phys2(sys_table->boottime->exit_boot_services,
+				handle, key);
+	if (status != EFI_SUCCESS)
+		goto free_mem_map;
+
+	/* Historic? */
+	boot_params->alt_mem_k = 32 * 1024;
+
+	/*
+	 * Convert the EFI memory map to E820.
+	 */
+	nr_entries = 0;
+	for (i = 0; i < size / desc_size; i++) {
+		efi_memory_desc_t *d;
+		unsigned int e820_type = 0;
+		unsigned long m = (unsigned long)mem_map;
+
+		d = (efi_memory_desc_t *)(m + (i * desc_size));
+		switch (d->type) {
+		case EFI_RESERVED_TYPE:
+		case EFI_RUNTIME_SERVICES_CODE:
+		case EFI_RUNTIME_SERVICES_DATA:
+		case EFI_MEMORY_MAPPED_IO:
+		case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
+		case EFI_PAL_CODE:
+			e820_type = E820_RESERVED;
+			break;
+
+		case EFI_UNUSABLE_MEMORY:
+			e820_type = E820_UNUSABLE;
+			break;
+
+		case EFI_ACPI_RECLAIM_MEMORY:
+			e820_type = E820_ACPI;
+			break;
+
+		case EFI_LOADER_CODE:
+		case EFI_LOADER_DATA:
+		case EFI_BOOT_SERVICES_CODE:
+		case EFI_BOOT_SERVICES_DATA:
+		case EFI_CONVENTIONAL_MEMORY:
+			e820_type = E820_RAM;
+			break;
+
+		case EFI_ACPI_MEMORY_NVS:
+			e820_type = E820_NVS;
+			break;
+
+		default:
+			continue;
+		}
+
+		/* Merge adjacent mappings */
+		if (prev && prev->type == e820_type &&
+		    (prev->addr + prev->size) == d->phys_addr)
+			prev->size += d->num_pages << 12;
+		else {
+			e820_map->addr = d->phys_addr;
+			e820_map->size = d->num_pages << 12;
+			e820_map->type = e820_type;
+			prev = e820_map++;
+			nr_entries++;
+		}
+	}
+
+	boot_params->e820_entries = nr_entries;
+
+	return EFI_SUCCESS;
+
+free_mem_map:
+	low_free(_size, (unsigned long)mem_map);
+free_cmdline:
+	if (options_size)
+		low_free(options_size, hdr->cmd_line_ptr);
+fail:
+	return status;
+}
+
+/*
+ * On success we return a pointer to a boot_params structure, and NULL
+ * on failure.
+ */
+struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
+{
+	struct boot_params *boot_params;
+	unsigned long start, nr_pages;
+	struct desc_ptr *gdt, *idt;
+	efi_loaded_image_t *image;
+	struct setup_header *hdr;
+	efi_status_t status;
+	efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
+	struct desc_struct *desc;
+
+	sys_table = _table;
+
+	/* Check if we were booted by the EFI firmware */
+	if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
+		goto fail;
+
+	status = efi_call_phys3(sys_table->boottime->handle_protocol,
+				handle, &proto, (void *)&image);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	memset(boot_params, 0x0, 0x4000);
+
+	/* Copy first two sectors to boot_params */
+	memcpy(boot_params, image->image_base, 1024);
+
+	hdr = &boot_params->hdr;
+
+	/*
+	 * The EFI firmware loader could have placed the kernel image
+	 * anywhere in memory, but the kernel has various restrictions
+	 * on the max physical address it can run at. Attempt to move
+	 * the kernel to boot_params.pref_address, or as low as
+	 * possible.
+	 */
+	start = hdr->pref_address;
+	nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+
+	status = efi_call_phys4(sys_table->boottime->allocate_pages,
+				EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+				nr_pages, &start);
+	if (status != EFI_SUCCESS) {
+		status = low_alloc(hdr->init_size, hdr->kernel_alignment,
+				   &start);
+		if (status != EFI_SUCCESS)
+			goto fail;
+	}
+
+	hdr->code32_start = (__u32)start;
+	hdr->pref_address = (__u64)(unsigned long)image->image_base;
+
+	memcpy((void *)start, image->image_base, image->image_size);
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, sizeof(*gdt),
+				(void **)&gdt);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	gdt->size = 0x800;
+	status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, sizeof(*idt),
+				(void **)&idt);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	idt->size = 0;
+	idt->address = 0;
+
+	status = make_boot_params(boot_params, image, handle);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	memset((char *)gdt->address, 0x0, gdt->size);
+	desc = (struct desc_struct *)gdt->address;
+
+	/* The first GDT is a dummy and the second is unused. */
+	desc += 2;
+
+	desc->limit0 = 0xffff;
+	desc->base0 = 0x0000;
+	desc->base1 = 0x0000;
+	desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
+	desc->s = DESC_TYPE_CODE_DATA;
+	desc->dpl = 0;
+	desc->p = 1;
+	desc->limit = 0xf;
+	desc->avl = 0;
+	desc->l = 0;
+	desc->d = SEG_OP_SIZE_32BIT;
+	desc->g = SEG_GRANULARITY_4KB;
+	desc->base2 = 0x00;
+
+	desc++;
+	desc->limit0 = 0xffff;
+	desc->base0 = 0x0000;
+	desc->base1 = 0x0000;
+	desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE;
+	desc->s = DESC_TYPE_CODE_DATA;
+	desc->dpl = 0;
+	desc->p = 1;
+	desc->limit = 0xf;
+	desc->avl = 0;
+	desc->l = 0;
+	desc->d = SEG_OP_SIZE_32BIT;
+	desc->g = SEG_GRANULARITY_4KB;
+	desc->base2 = 0x00;
+
+#ifdef CONFIG_X86_64
+	/* Task segment value */
+	desc++;
+	desc->limit0 = 0x0000;
+	desc->base0 = 0x0000;
+	desc->base1 = 0x0000;
+	desc->type = SEG_TYPE_TSS;
+	desc->s = 0;
+	desc->dpl = 0;
+	desc->p = 1;
+	desc->limit = 0x0;
+	desc->avl = 0;
+	desc->l = 0;
+	desc->d = 0;
+	desc->g = SEG_GRANULARITY_4KB;
+	desc->base2 = 0x00;
+#endif /* CONFIG_X86_64 */
+
+	asm volatile ("lidt %0" : : "m" (*idt));
+	asm volatile ("lgdt %0" : : "m" (*gdt));
+
+	asm volatile("cli");
+
+	return boot_params;
+fail:
+	return NULL;
+}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
new file mode 100644
index 000000000000..f66d023e91ef
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.h
@@ -0,0 +1,60 @@
+#ifndef BOOT_COMPRESSED_EBOOT_H
+#define BOOT_COMPRESSED_EBOOT_H
+
+#define SEG_TYPE_DATA		(0 << 3)
+#define SEG_TYPE_READ_WRITE	(1 << 1)
+#define SEG_TYPE_CODE		(1 << 3)
+#define SEG_TYPE_EXEC_READ	(1 << 1)
+#define SEG_TYPE_TSS		((1 << 3) | (1 << 0))
+#define SEG_OP_SIZE_32BIT	(1 << 0)
+#define SEG_GRANULARITY_4KB	(1 << 0)
+
+#define DESC_TYPE_CODE_DATA	(1 << 0)
+
+#define EFI_PAGE_SIZE		(1UL << EFI_PAGE_SHIFT)
+
+#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR		0
+#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR		1
+#define PIXEL_BIT_MASK					2
+#define PIXEL_BLT_ONLY					3
+#define PIXEL_FORMAT_MAX				4
+
+struct efi_pixel_bitmask {
+	u32 red_mask;
+	u32 green_mask;
+	u32 blue_mask;
+	u32 reserved_mask;
+};
+
+struct efi_graphics_output_mode_info {
+	u32 version;
+	u32 horizontal_resolution;
+	u32 vertical_resolution;
+	int pixel_format;
+	struct efi_pixel_bitmask pixel_information;
+	u32 pixels_per_scan_line;
+} __packed;
+
+struct efi_graphics_output_protocol_mode {
+	u32 max_mode;
+	u32 mode;
+	unsigned long info;
+	unsigned long size_of_info;
+	u64 frame_buffer_base;
+	unsigned long frame_buffer_size;
+} __packed;
+
+struct efi_graphics_output_protocol {
+	void *query_mode;
+	unsigned long set_mode;
+	unsigned long blt;
+	struct efi_graphics_output_protocol_mode *mode;
+};
+
+struct efi_uga_draw_protocol {
+	void *get_mode;
+	void *set_mode;
+	void *blt;
+};
+
+#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S
new file mode 100644
index 000000000000..a53440e81d52
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_32.S
@@ -0,0 +1,86 @@
+/*
+ * EFI call stub for IA32.
+ *
+ * This stub allows us to make EFI calls in physical mode with interrupts
+ * turned off. Note that this implementation is different from the one in
+ * arch/x86/platform/efi/efi_stub_32.S because we're _already_ in physical
+ * mode at this point.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+
+/*
+ * efi_call_phys(void *, ...) is a function with variable parameters.
+ * All the callers of this function assure that all the parameters are 4-bytes.
+ */
+
+/*
+ * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
+ * So we'd better save all of them at the beginning of this function and restore
+ * at the end no matter how many we use, because we can not assure EFI runtime
+ * service functions will comply with gcc calling convention, too.
+ */
+
+.text
+ENTRY(efi_call_phys)
+	/*
+	 * 0. The function can only be called in Linux kernel. So CS has been
+	 * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
+	 * the values of these registers are the same. And, the corresponding
+	 * GDT entries are identical. So I will do nothing about segment reg
+	 * and GDT, but change GDT base register in prelog and epilog.
+	 */
+
+	/*
+	 * 1. Because we haven't been relocated by this point we need to
+	 * use relative addressing.
+	 */
+	call	1f
+1:	popl	%edx
+	subl	$1b, %edx
+
+	/*
+	 * 2. Now on the top of stack is the return
+	 * address in the caller of efi_call_phys(), then parameter 1,
+	 * parameter 2, ..., param n. To make things easy, we save the return
+	 * address of efi_call_phys in a global variable.
+	 */
+	popl	%ecx
+	movl	%ecx, saved_return_addr(%edx)
+	/* get the function pointer into ECX*/
+	popl	%ecx
+	movl	%ecx, efi_rt_function_ptr(%edx)
+
+	/*
+	 * 3. Call the physical function.
+	 */
+	call	*%ecx
+
+	/*
+	 * 4. Balance the stack. And because EAX contain the return value,
+	 * we'd better not clobber it. We need to calculate our address
+	 * again because %ecx and %edx are not preserved across EFI function
+	 * calls.
+	 */
+	call	1f
+1:	popl	%edx
+	subl	$1b, %edx
+
+	movl	efi_rt_function_ptr(%edx), %ecx
+	pushl	%ecx
+
+	/*
+	 * 10. Push the saved return address onto the stack and return.
+	 */
+	movl	saved_return_addr(%edx), %ecx
+	pushl	%ecx
+	ret
+ENDPROC(efi_call_phys)
+.previous
+
+.data
+saved_return_addr:
+	.long 0
+efi_rt_function_ptr:
+	.long 0
diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S
new file mode 100644
index 000000000000..cedc60de86eb
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_64.S
@@ -0,0 +1 @@
+#include "../../platform/efi/efi_stub_64.S"
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 67a655a39ce4..a0559930a180 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -32,6 +32,28 @@
 
 	__HEAD
 ENTRY(startup_32)
+#ifdef CONFIG_EFI_STUB
+	/*
+	 * We don't need the return address, so set up the stack so
+	 * efi_main() can find its arugments.
+	 */
+	add	$0x4, %esp
+
+	call	efi_main
+	cmpl	$0, %eax
+	je	preferred_addr
+	movl	%eax, %esi
+	call	1f
+1:
+	popl	%eax
+	subl	$1b, %eax
+	subl	BP_pref_address(%esi), %eax
+	add	BP_code32_start(%esi), %eax
+	leal	preferred_addr(%eax), %eax
+	jmp	*%eax
+
+preferred_addr:
+#endif
 	cld
 	/*
 	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 35af09d13dc1..558d76ce23bc 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -199,6 +199,26 @@ ENTRY(startup_64)
 	 * an identity mapped page table being provied that maps our
 	 * entire text+data+bss and hopefully all of memory.
 	 */
+#ifdef CONFIG_EFI_STUB
+	pushq	%rsi
+	mov	%rcx, %rdi
+	mov	%rdx, %rsi
+	call	efi_main
+	popq	%rsi
+	cmpq	$0,%rax
+	je	preferred_addr
+	movq	%rax,%rsi
+	call	1f
+1:
+	popq	%rax
+	subq	$1b, %rax
+	subq	BP_pref_address(%rsi), %rax
+	add	BP_code32_start(%esi), %eax
+	leaq	preferred_addr(%rax), %rax
+	jmp	*%rax
+
+preferred_addr:
+#endif
 
 	/* Setup data segments. */
 	xorl	%eax, %eax
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index 19b3e693cd72..ffb9c5c9d748 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -1,2 +1,11 @@
 #include "misc.h"
+
+int memcmp(const void *s1, const void *s2, size_t len)
+{
+	u8 diff;
+	asm("repe; cmpsb; setnz %0"
+	    : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
+	return diff;
+}
+
 #include "../string.c"
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index bdb4d458ec8c..f1bbeeb09148 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -45,6 +45,11 @@ SYSSEG		= 0x1000		/* historical load address >> 4 */
 
 	.global bootsect_start
 bootsect_start:
+#ifdef CONFIG_EFI_STUB
+	# "MZ", MS-DOS header
+	.byte 0x4d
+	.byte 0x5a
+#endif
 
 	# Normalize the start address
 	ljmp	$BOOTSEG, $start2
@@ -79,6 +84,14 @@ bs_die:
 	# invoke the BIOS reset code...
 	ljmp	$0xf000,$0xfff0
 
+#ifdef CONFIG_EFI_STUB
+	.org	0x3c
+	#
+	# Offset to the PE header.
+	#
+	.long	pe_header
+#endif /* CONFIG_EFI_STUB */
+
 	.section ".bsdata", "a"
 bugger_off_msg:
 	.ascii	"Direct booting from floppy is no longer supported.\r\n"
@@ -87,6 +100,141 @@ bugger_off_msg:
 	.ascii	"Remove disk and press any key to reboot . . .\r\n"
 	.byte	0
 
+#ifdef CONFIG_EFI_STUB
+pe_header:
+	.ascii	"PE"
+	.word 	0
+
+coff_header:
+#ifdef CONFIG_X86_32
+	.word	0x14c				# i386
+#else
+	.word	0x8664				# x86-64
+#endif
+	.word	2				# nr_sections
+	.long	0 				# TimeDateStamp
+	.long	0				# PointerToSymbolTable
+	.long	1				# NumberOfSymbols
+	.word	section_table - optional_header	# SizeOfOptionalHeader
+#ifdef CONFIG_X86_32
+	.word	0x306				# Characteristics.
+						# IMAGE_FILE_32BIT_MACHINE |
+						# IMAGE_FILE_DEBUG_STRIPPED |
+						# IMAGE_FILE_EXECUTABLE_IMAGE |
+						# IMAGE_FILE_LINE_NUMS_STRIPPED
+#else
+	.word	0x206				# Characteristics
+						# IMAGE_FILE_DEBUG_STRIPPED |
+						# IMAGE_FILE_EXECUTABLE_IMAGE |
+						# IMAGE_FILE_LINE_NUMS_STRIPPED
+#endif
+
+optional_header:
+#ifdef CONFIG_X86_32
+	.word	0x10b				# PE32 format
+#else
+	.word	0x20b 				# PE32+ format
+#endif
+	.byte	0x02				# MajorLinkerVersion
+	.byte	0x14				# MinorLinkerVersion
+
+	# Filled in by build.c
+	.long	0				# SizeOfCode
+
+	.long	0				# SizeOfInitializedData
+	.long	0				# SizeOfUninitializedData
+
+	# Filled in by build.c
+	.long	0x0000				# AddressOfEntryPoint
+
+	.long	0x0000				# BaseOfCode
+#ifdef CONFIG_X86_32
+	.long	0				# data
+#endif
+
+extra_header_fields:
+#ifdef CONFIG_X86_32
+	.long	0				# ImageBase
+#else
+	.quad	0				# ImageBase
+#endif
+	.long	0x1000				# SectionAlignment
+	.long	0x200				# FileAlignment
+	.word	0				# MajorOperatingSystemVersion
+	.word	0				# MinorOperatingSystemVersion
+	.word	0				# MajorImageVersion
+	.word	0				# MinorImageVersion
+	.word	0				# MajorSubsystemVersion
+	.word	0				# MinorSubsystemVersion
+	.long	0				# Win32VersionValue
+
+	#
+	# The size of the bzImage is written in tools/build.c
+	#
+	.long	0				# SizeOfImage
+
+	.long	0x200				# SizeOfHeaders
+	.long	0				# CheckSum
+	.word	0xa				# Subsystem (EFI application)
+	.word	0				# DllCharacteristics
+#ifdef CONFIG_X86_32
+	.long	0				# SizeOfStackReserve
+	.long	0				# SizeOfStackCommit
+	.long	0				# SizeOfHeapReserve
+	.long	0				# SizeOfHeapCommit
+#else
+	.quad	0				# SizeOfStackReserve
+	.quad	0				# SizeOfStackCommit
+	.quad	0				# SizeOfHeapReserve
+	.quad	0				# SizeOfHeapCommit
+#endif
+	.long	0				# LoaderFlags
+	.long	0x1				# NumberOfRvaAndSizes
+
+	.quad	0				# ExportTable
+	.quad	0				# ImportTable
+	.quad	0				# ResourceTable
+	.quad	0				# ExceptionTable
+	.quad	0				# CertificationTable
+	.quad	0				# BaseRelocationTable
+
+	# Section table
+section_table:
+	.ascii	".text"
+	.byte	0
+	.byte	0
+	.byte	0
+	.long	0
+	.long	0x0				# startup_{32,64}
+	.long	0				# Size of initialized data
+						# on disk
+	.long	0x0				# startup_{32,64}
+	.long	0				# PointerToRelocations
+	.long	0				# PointerToLineNumbers
+	.word	0				# NumberOfRelocations
+	.word	0				# NumberOfLineNumbers
+	.long	0x60500020			# Characteristics (section flags)
+
+	#
+	# The EFI application loader requires a relocation section
+	# because EFI applications are relocatable and not having
+	# this section seems to confuse it. But since we don't need
+	# the loader to fixup any relocs for us just fill it with a
+	# single dummy reloc.
+	#
+	.ascii	".reloc"
+	.byte	0
+	.byte	0
+	.long	reloc_end - reloc_start
+	.long	reloc_start
+	.long	reloc_end - reloc_start		# SizeOfRawData
+	.long	reloc_start			# PointerToRawData
+	.long	0				# PointerToRelocations
+	.long	0				# PointerToLineNumbers
+	.word	0				# NumberOfRelocations
+	.word	0				# NumberOfLineNumbers
+	.long	0x42100040			# Characteristics (section flags)
+#endif /* CONFIG_EFI_STUB */
 
 	# Kernel attributes; used by setup.  This is part 1 of the
 	# header, from the old boot sector.
@@ -318,3 +466,13 @@ die:
 setup_corrupt:
 	.byte	7
 	.string	"No setup signature found...\n"
+
+	.data
+dummy:	.long	0
+
+	.section .reloc
+reloc_start:
+	.long	dummy - reloc_start
+	.long	10
+	.word	0
+reloc_end:
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 3cbc4058dd26..574dedfe2890 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -111,3 +111,38 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas
 
 	return result;
 }
+
+/**
+ * strlen - Find the length of a string
+ * @s: The string to be sized
+ */
+size_t strlen(const char *s)
+{
+	const char *sc;
+
+	for (sc = s; *sc != '\0'; ++sc)
+		/* nothing */;
+	return sc - s;
+}
+
+/**
+ * strstr - Find the first substring in a %NUL terminated string
+ * @s1: The string to be searched
+ * @s2: The string to search for
+ */
+char *strstr(const char *s1, const char *s2)
+{
+	size_t l1, l2;
+
+	l2 = strlen(s2);
+	if (!l2)
+		return (char *)s1;
+	l1 = strlen(s1);
+	while (l1 >= l2) {
+		l1--;
+		if (!memcmp(s1, s2, l2))
+			return (char *)s1;
+		s1++;
+	}
+	return NULL;
+}
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index fdc60a0b3c20..4e9bd6bcafa6 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -135,6 +135,9 @@ static void usage(void)
 
 int main(int argc, char ** argv)
 {
+#ifdef CONFIG_EFI_STUB
+	unsigned int file_sz, pe_header;
+#endif
 	unsigned int i, sz, setup_sectors;
 	int c;
 	u32 sys_size;
@@ -194,6 +197,42 @@ int main(int argc, char ** argv)
 	buf[0x1f6] = sys_size >> 16;
 	buf[0x1f7] = sys_size >> 24;
 
+#ifdef CONFIG_EFI_STUB
+	file_sz = sz + i + ((sys_size * 16) - sz);
+
+	pe_header = *(unsigned int *)&buf[0x3c];
+
+	/* Size of code */
+	*(unsigned int *)&buf[pe_header + 0x1c] = file_sz;
+
+	/* Size of image */
+	*(unsigned int *)&buf[pe_header + 0x50] = file_sz;
+
+#ifdef CONFIG_X86_32
+	/* Address of entry point */
+	*(unsigned int *)&buf[pe_header + 0x28] = i;
+
+	/* .text size */
+	*(unsigned int *)&buf[pe_header + 0xb0] = file_sz;
+
+	/* .text size of initialised data */
+	*(unsigned int *)&buf[pe_header + 0xb8] = file_sz;
+#else
+	/*
+	 * Address of entry point. startup_32 is at the beginning and
+	 * the 64-bit entry point (startup_64) is always 512 bytes
+	 * after.
+	 */
+	*(unsigned int *)&buf[pe_header + 0x28] = i + 512;
+
+	/* .text size */
+	*(unsigned int *)&buf[pe_header + 0xc0] = file_sz;
+
+	/* .text size of initialised data */
+	*(unsigned int *)&buf[pe_header + 0xc8] = file_sz;
+#endif /* CONFIG_X86_32 */
+#endif /* CONFIG_EFI_STUB */
+
 	crc = partial_crc32(buf, i, crc);
 	if (fwrite(buf, 1, i, stdout) != i)
 		die("Writing setup failed");
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 4f13fafc5264..68de2dc962ec 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -67,4 +67,6 @@ void common(void) {
 	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
 	OFFSET(BP_version, boot_params, hdr.version);
 	OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+	OFFSET(BP_pref_address, boot_params, hdr.pref_address);
+	OFFSET(BP_code32_start, boot_params, hdr.code32_start);
 }
-- 
cgit v1.2.3-55-g7522


From 3e8f9451d3db669d7c0d8b330d4f5770149d90d5 Mon Sep 17 00:00:00 2001
From: Alan Cox
Date: Thu, 15 Dec 2011 22:19:41 +0000
Subject: x86: Fix INTEL_MID silly

Doh.. pass the brown paper bags - preferably filled with mince
pies..

This fixes occasional build failures.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/n/tip-r0oc1knlvzuqr69artaeq8s8@git.kernel.org
[ extended the changelog a bit ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index faf39a0d6242..2b54a2fb3ab0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -429,6 +429,7 @@ config X86_MDFLD
 	select SPI
 	select INTEL_SCU_IPC
 	select X86_PLATFORM_DEVICES
+	select X86_INTEL_MID
 	---help---
 	  Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
 	  Internet Device(MID) platform. 
-- 
cgit v1.2.3-55-g7522


From 2d2da60fb40a80cc59383121ccf763e0e0e8a42a Mon Sep 17 00:00:00 2001
From: Maarten Lankhorst
Date: Fri, 16 Dec 2011 13:30:58 +0100
Subject: x86, efi: Break up large initrd reads

The efi boot stub tries to read the entire initrd in 1 go, however
some efi implementations hang if too much if asked to read too much
data at the same time. After some experimentation I found out that my
asrock p67 board will hang if asked to read chunks of 4MiB, so use a
safe value.

elilo reads in chunks of 16KiB, but since that requires many read
calls I use a value of 1 MiB.  hpa suggested adding individual
blacklists for when systems are found where this value causes a crash.

Signed-off-by: Maarten Lankhorst <m.b.lankhorst@gmail.com>
Link: http://lkml.kernel.org/r/4EEB3A02.3090201@gmail.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/compressed/eboot.c | 20 ++++++++++++++------
 arch/x86/boot/compressed/eboot.h |  1 +
 2 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 4055e63d0b04..fec216f4fbc3 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -643,14 +643,22 @@ grow:
 			u64 size;
 
 			size = initrds[j].size;
-			status = efi_call_phys3(fh->read, initrds[j].handle,
-						&size, addr);
-			if (status != EFI_SUCCESS)
-				goto free_initrd_total;
+			while (size) {
+				u64 chunksize;
+				if (size > EFI_READ_CHUNK_SIZE)
+					chunksize = EFI_READ_CHUNK_SIZE;
+				else
+					chunksize = size;
+				status = efi_call_phys3(fh->read,
+							initrds[j].handle,
+							&chunksize, addr);
+				if (status != EFI_SUCCESS)
+					goto free_initrd_total;
+				addr += chunksize;
+				size -= chunksize;
+			}
 
 			efi_call_phys1(fh->close, initrds[j].handle);
-
-			addr += size;
 		}
 
 	}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index f66d023e91ef..39251663e65b 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -12,6 +12,7 @@
 #define DESC_TYPE_CODE_DATA	(1 << 0)
 
 #define EFI_PAGE_SIZE		(1UL << EFI_PAGE_SHIFT)
+#define EFI_READ_CHUNK_SIZE	(1024 * 1024)
 
 #define PIXEL_RGB_RESERVED_8BIT_PER_COLOR		0
 #define PIXEL_BGR_RESERVED_8BIT_PER_COLOR		1
-- 
cgit v1.2.3-55-g7522


From a0c3832a578c84d4a93c61e22cb09c99fa9447ea Mon Sep 17 00:00:00 2001
From: Alan Cox
Date: Sat, 17 Dec 2011 21:57:25 +0000
Subject: x86/apb: Fix configuration constraints

The APB timer requires SFI, SCU and MID support

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/r/20111217215719.3743.93550.stgit@bob.linux.org.uk
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2b54a2fb3ab0..ca4ee7644855 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -665,6 +665,7 @@ config APB_TIMER
        def_bool y if MRST
        prompt "Langwell APB Timer Support" if X86_MRST
        select DW_APB_TIMER
+       depends on X86_INTEL_MID && SFI
        help
          APB timer is the replacement for 8254, HPET on X86 MID platforms.
          The APBT provides a stable time base on SMP
-- 
cgit v1.2.3-55-g7522


From 933b9463a0ef75da681747b2dac06c1754465372 Mon Sep 17 00:00:00 2001
From: Alan Cox
Date: Sat, 17 Dec 2011 17:43:40 +0000
Subject: x86/intel config: Revamp configuration to allow for Moorestown and
 Medfield

This sets all up the other bits that need to be INTEL_MID
specific rather than Moorestown specific.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/r/20111217174318.7207.91543.stgit@bob.linux.org.uk
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                | 4 ++--
 arch/x86/include/asm/fixmap.h   | 2 +-
 arch/x86/include/asm/setup.h    | 2 +-
 arch/x86/pci/Makefile           | 2 +-
 arch/x86/platform/mrst/Makefile | 4 ++--
 drivers/rtc/Kconfig             | 6 +++---
 6 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ca4ee7644855..c3c9343e4498 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -662,8 +662,8 @@ config HPET_EMULATE_RTC
 	depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
 
 config APB_TIMER
-       def_bool y if MRST
-       prompt "Langwell APB Timer Support" if X86_MRST
+       def_bool y if X86_INTEL_MID
+       prompt "Intel MID APB Timer Support" if X86_INTEL_MID
        select DW_APB_TIMER
        depends on X86_INTEL_MID && SFI
        help
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 460c74e4852c..4da3c0c4c974 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -117,7 +117,7 @@ enum fixed_addresses {
 #endif
 	FIX_TEXT_POKE1,	/* reserve 2 pages for text_poke() */
 	FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
-#ifdef	CONFIG_X86_MRST
+#ifdef	CONFIG_X86_INTEL_MID
 	FIX_LNW_VRTC,
 #endif
 	__end_of_permanent_fixed_addresses,
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 9756551ec760..d0f19f9fb846 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -47,7 +47,7 @@ extern void reserve_standard_io_resources(void);
 extern void i386_reserve_resources(void);
 extern void setup_default_timer_irq(void);
 
-#ifdef CONFIG_X86_MRST
+#ifdef CONFIG_X86_INTEL_MID
 extern void x86_mrst_early_setup(void);
 #else
 static inline void x86_mrst_early_setup(void) { }
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 6b8759f7634e..75b06f34b1f2 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -15,7 +15,7 @@ obj-$(CONFIG_X86_VISWS)		+= visws.o
 
 obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
 
-obj-$(CONFIG_X86_MRST)		+= mrst.o
+obj-$(CONFIG_X86_INTEL_MID)	+= mrst.o
 
 obj-y				+= common.o early.o
 obj-y				+= amd_bus.o bus_numa.o
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index ddeec7300464..7baed5135e0f 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_X86_MRST)		+= mrst.o
-obj-$(CONFIG_X86_MRST)		+= vrtc.o
+obj-$(CONFIG_X86_INTEL_MID)	+= mrst.o
+obj-$(CONFIG_X86_INTEL_MID)	+= vrtc.o
 obj-$(CONFIG_EARLY_PRINTK_INTEL_MID)	+= early_printk_mrst.o
 obj-$(CONFIG_X86_MRST)		+= pmu.o
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 53eb4e55b289..3a125b835546 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -498,9 +498,9 @@ config RTC_DRV_CMOS
 	  will be called rtc-cmos.
 
 config RTC_DRV_VRTC
-	tristate "Virtual RTC for Moorestown platforms"
-	depends on X86_MRST
-	default y if X86_MRST
+	tristate "Virtual RTC for Intel MID platforms"
+	depends on X86_INTEL_MID
+	default y if X86_INTEL_MID
 
 	help
 	Say "yes" here to get direct support for the real time clock
-- 
cgit v1.2.3-55-g7522


From d79a8869d8a4b565b12a88faeff834b09a36368c Mon Sep 17 00:00:00 2001
From: Michael Demeter
Date: Thu, 15 Dec 2011 22:31:23 +0000
Subject: x86/mrst: Add additional debug prints for pb_keys

Added additional debug output that we always seem to add
during power ons to validate firmware operation.

Signed-off-by: Michael Demeter <michael.demeter@intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/r/20111215223116.10166.50803.stgit@bob.linux.org.uk
[ fixed line breaks, formatting and commit title. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/mrst/mrst.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 6a21f603bd78..b6a33d2bd4d6 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -983,6 +983,7 @@ static int __init pb_keys_init(void)
 	num = sizeof(gpio_button) / sizeof(struct gpio_keys_button);
 	for (i = 0; i < num; i++) {
 		gb[i].gpio = get_gpio_by_name(gb[i].desc);
+		pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio);
 		if (gb[i].gpio == -1)
 			continue;
 
-- 
cgit v1.2.3-55-g7522


From 88715b9ade718564fd8b1318735826370481366b Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Tue, 13 Dec 2011 12:53:07 +0200
Subject: crypto: twofish-x86_64-3way - remove unneeded LRW/XTS #ifdefs

Since LRW & XTS are selected by twofish-x86_64-3way, we don't need these
#ifdefs anymore.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish_glue_3way.c | 32 --------------------------------
 1 file changed, 32 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 954f59eeb7b4..7fee8c152f93 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -35,14 +35,6 @@
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
 
-#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
-#define HAS_LRW
-#endif
-
-#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
-#define HAS_XTS
-#endif
-
 /* regular block cipher functions from twofish_x86_64 module */
 asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
 				const u8 *src);
@@ -442,8 +434,6 @@ static struct crypto_alg blk_ctr_alg = {
 	},
 };
 
-#if defined(HAS_LRW) || defined(HAS_XTS)
-
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = TF_BLOCK_SIZE;
@@ -474,10 +464,6 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 		twofish_dec_blk(ctx, srcdst, srcdst);
 }
 
-#endif
-
-#ifdef HAS_LRW
-
 struct twofish_lrw_ctx {
 	struct lrw_table_ctx lrw_table;
 	struct twofish_ctx twofish_ctx;
@@ -562,10 +548,6 @@ static struct crypto_alg blk_lrw_alg = {
 	},
 };
 
-#endif
-
-#ifdef HAS_XTS
-
 struct twofish_xts_ctx {
 	struct twofish_ctx tweak_ctx;
 	struct twofish_ctx crypt_ctx;
@@ -655,8 +637,6 @@ static struct crypto_alg blk_xts_alg = {
 	},
 };
 
-#endif
-
 int __init init(void)
 {
 	int err;
@@ -670,27 +650,19 @@ int __init init(void)
 	err = crypto_register_alg(&blk_ctr_alg);
 	if (err)
 		goto ctr_err;
-#ifdef HAS_LRW
 	err = crypto_register_alg(&blk_lrw_alg);
 	if (err)
 		goto blk_lrw_err;
-#endif
-#ifdef HAS_XTS
 	err = crypto_register_alg(&blk_xts_alg);
 	if (err)
 		goto blk_xts_err;
-#endif
 
 	return 0;
 
-#ifdef HAS_XTS
 	crypto_unregister_alg(&blk_xts_alg);
 blk_xts_err:
-#endif
-#ifdef HAS_LRW
 	crypto_unregister_alg(&blk_lrw_alg);
 blk_lrw_err:
-#endif
 	crypto_unregister_alg(&blk_ctr_alg);
 ctr_err:
 	crypto_unregister_alg(&blk_cbc_alg);
@@ -702,12 +674,8 @@ ecb_err:
 
 void __exit fini(void)
 {
-#ifdef HAS_XTS
 	crypto_unregister_alg(&blk_xts_alg);
-#endif
-#ifdef HAS_LRW
 	crypto_unregister_alg(&blk_lrw_alg);
-#endif
 	crypto_unregister_alg(&blk_ctr_alg);
 	crypto_unregister_alg(&blk_cbc_alg);
 	crypto_unregister_alg(&blk_ecb_alg);
-- 
cgit v1.2.3-55-g7522


From 7ba8babf84fa4e9b648e247223043785f596dd23 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Tue, 13 Dec 2011 12:53:17 +0200
Subject: crypto: serpent-sse2 - remove unneeded LRW/XTS #ifdefs

Since LRW & XTS are selected by serpent-sse2, we don't need these #ifdefs
anymore.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_sse2_glue.c | 40 -------------------------------------
 1 file changed, 40 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 2f5c304653f4..7955a9b76b91 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -47,14 +47,6 @@
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
 
-#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
-#define HAS_LRW
-#endif
-
-#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
-#define HAS_XTS
-#endif
-
 struct async_serpent_ctx {
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
@@ -470,8 +462,6 @@ static struct crypto_alg blk_ctr_alg = {
 	},
 };
 
-#if defined(HAS_LRW) || defined(HAS_XTS)
-
 struct crypt_priv {
 	struct serpent_ctx *ctx;
 	bool fpu_enabled;
@@ -511,10 +501,6 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
 }
 
-#endif
-
-#ifdef HAS_LRW
-
 struct serpent_lrw_ctx {
 	struct lrw_table_ctx lrw_table;
 	struct serpent_ctx serpent_ctx;
@@ -620,10 +606,6 @@ static struct crypto_alg blk_lrw_alg = {
 	},
 };
 
-#endif
-
-#ifdef HAS_XTS
-
 struct serpent_xts_ctx {
 	struct serpent_ctx tweak_ctx;
 	struct serpent_ctx crypt_ctx;
@@ -730,8 +712,6 @@ static struct crypto_alg blk_xts_alg = {
 	},
 };
 
-#endif
-
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
 {
@@ -930,8 +910,6 @@ static struct crypto_alg ablk_ctr_alg = {
 	},
 };
 
-#ifdef HAS_LRW
-
 static int ablk_lrw_init(struct crypto_tfm *tfm)
 {
 	struct cryptd_ablkcipher *cryptd_tfm;
@@ -970,10 +948,6 @@ static struct crypto_alg ablk_lrw_alg = {
 	},
 };
 
-#endif
-
-#ifdef HAS_XTS
-
 static int ablk_xts_init(struct crypto_tfm *tfm)
 {
 	struct cryptd_ablkcipher *cryptd_tfm;
@@ -1010,8 +984,6 @@ static struct crypto_alg ablk_xts_alg = {
 	},
 };
 
-#endif
-
 static int __init serpent_sse2_init(void)
 {
 	int err;
@@ -1039,36 +1011,28 @@ static int __init serpent_sse2_init(void)
 	err = crypto_register_alg(&ablk_ctr_alg);
 	if (err)
 		goto ablk_ctr_err;
-#ifdef HAS_LRW
 	err = crypto_register_alg(&blk_lrw_alg);
 	if (err)
 		goto blk_lrw_err;
 	err = crypto_register_alg(&ablk_lrw_alg);
 	if (err)
 		goto ablk_lrw_err;
-#endif
-#ifdef HAS_XTS
 	err = crypto_register_alg(&blk_xts_alg);
 	if (err)
 		goto blk_xts_err;
 	err = crypto_register_alg(&ablk_xts_alg);
 	if (err)
 		goto ablk_xts_err;
-#endif
 	return err;
 
-#ifdef HAS_XTS
 	crypto_unregister_alg(&ablk_xts_alg);
 ablk_xts_err:
 	crypto_unregister_alg(&blk_xts_alg);
 blk_xts_err:
-#endif
-#ifdef HAS_LRW
 	crypto_unregister_alg(&ablk_lrw_alg);
 ablk_lrw_err:
 	crypto_unregister_alg(&blk_lrw_alg);
 blk_lrw_err:
-#endif
 	crypto_unregister_alg(&ablk_ctr_alg);
 ablk_ctr_err:
 	crypto_unregister_alg(&ablk_cbc_alg);
@@ -1086,14 +1050,10 @@ blk_ecb_err:
 
 static void __exit serpent_sse2_exit(void)
 {
-#ifdef HAS_XTS
 	crypto_unregister_alg(&ablk_xts_alg);
 	crypto_unregister_alg(&blk_xts_alg);
-#endif
-#ifdef HAS_LRW
 	crypto_unregister_alg(&ablk_lrw_alg);
 	crypto_unregister_alg(&blk_lrw_alg);
-#endif
 	crypto_unregister_alg(&ablk_ctr_alg);
 	crypto_unregister_alg(&ablk_cbc_alg);
 	crypto_unregister_alg(&ablk_ecb_alg);
-- 
cgit v1.2.3-55-g7522


From 549c89b98c4530b278dde1a3f68ce5ebbb1e6304 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Tue, 29 Nov 2011 12:44:55 -0800
Subject: x86: Do not schedule while still in NMI context

The NMI handler uses the paranoid_exit routine that checks the
NEED_RESCHED flag, and if it is set and the return is for userspace,
then interrupts are enabled, the stack is swapped to the thread's stack,
and schedule is called. The problem with this is that we are still in an
NMI context until an iret is executed. This means that any new NMIs are
now starved until an interrupt or exception occurs and does the iret.

As NMIs can not be masked and can interrupt any location, they are
treated as a special case. NEED_RESCHED should not be set in an NMI
handler. The interruption by the NMI should not disturb the work flow
for scheduling. Any IPI sent to a processor after sending the
NEED_RESCHED would have to wait for the NMI anyway, and after the IPI
finishes the schedule would be called as required.

There is no reason to do anything special leaving an NMI. Remove the
call to paranoid_exit and do a simple return. This not only fixes the
bug of starved NMIs, but it also cleans up the code.

Link: http://lkml.kernel.org/r/CA+55aFzgM55hXTs4griX5e9=v_O+=ue+7Rj0PTD=M7hFYpyULQ@mail.gmail.com

Acked-by: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "H. Peter Anvin" <hpa@linux.intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 32 --------------------------------
 1 file changed, 32 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index faf8d5e74b0b..3819ea907339 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1489,46 +1489,14 @@ ENTRY(nmi)
 	movq %rsp,%rdi
 	movq $-1,%rsi
 	call do_nmi
-#ifdef CONFIG_TRACE_IRQFLAGS
-	/* paranoidexit; without TRACE_IRQS_OFF */
-	/* ebx:	no swapgs flag */
-	DISABLE_INTERRUPTS(CLBR_NONE)
 	testl %ebx,%ebx				/* swapgs needed? */
 	jnz nmi_restore
-	testl $3,CS(%rsp)
-	jnz nmi_userspace
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_ALL 8
 	jmp irq_return
-nmi_userspace:
-	GET_THREAD_INFO(%rcx)
-	movl TI_flags(%rcx),%ebx
-	andl $_TIF_WORK_MASK,%ebx
-	jz nmi_swapgs
-	movq %rsp,%rdi			/* &pt_regs */
-	call sync_regs
-	movq %rax,%rsp			/* switch stack for scheduling */
-	testl $_TIF_NEED_RESCHED,%ebx
-	jnz nmi_schedule
-	movl %ebx,%edx			/* arg3: thread flags */
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	xorl %esi,%esi 			/* arg2: oldset */
-	movq %rsp,%rdi 			/* arg1: &pt_regs */
-	call do_notify_resume
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	jmp nmi_userspace
-nmi_schedule:
-	ENABLE_INTERRUPTS(CLBR_ANY)
-	call schedule
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	jmp nmi_userspace
 	CFI_ENDPROC
-#else
-	jmp paranoid_exit
-	CFI_ENDPROC
-#endif
 END(nmi)
 
 ENTRY(ignore_sysret)
-- 
cgit v1.2.3-55-g7522


From 1fd466efc88c48f50e5ee29f4dbb4e210a889172 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 8 Dec 2011 12:32:27 -0500
Subject: x86: Document the NMI handler about not using paranoid_exit

Linus cleaned up the NMI handler but it still needs some comments to
explain why it uses save_paranoid but not paranoid_exit. Just to keep
others from adding that in the future, document why it's not used.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3819ea907339..d1d5434e7f6a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1480,9 +1480,16 @@ END(error_exit)
 ENTRY(nmi)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	pushq_cfi $-1
+	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+	/*
+	 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+	 * as we should not be calling schedule in NMI context.
+	 * Even with normal interrupts enabled. An NMI should not be
+	 * setting NEED_RESCHED or anything that normal interrupts and
+	 * exceptions might do.
+	 */
 	call save_paranoid
 	DEFAULT_FRAME 0
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
-- 
cgit v1.2.3-55-g7522


From 3f3c8b8c4b2a34776c3470142a7c8baafcda6eb0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 8 Dec 2011 12:36:23 -0500
Subject: x86: Add workaround to NMI iret woes

In x86, when an NMI goes off, the CPU goes into an NMI context that
prevents other NMIs to trigger on that CPU. If an NMI is suppose to
trigger, it has to wait till the previous NMI leaves NMI context.
At that time, the next NMI can trigger (note, only one more NMI will
trigger, as only one can be latched at a time).

The way x86 gets out of NMI context is by calling iret. The problem
with this is that this causes problems if the NMI handle either
triggers an exception, or a breakpoint. Both the exception and the
breakpoint handlers will finish with an iret. If this happens while
in NMI context, the CPU will leave NMI context and a new NMI may come
in. As NMI handlers are not made to be re-entrant, this can cause
havoc with the system, not to mention, the nested NMI will write
all over the previous NMI's stack.

Linus Torvalds proposed the following workaround to this problem:

https://lkml.org/lkml/2010/7/14/264

"In fact, I wonder if we couldn't just do a software NMI disable
instead? Hav ea per-cpu variable (in the _core_ percpu areas that get
allocated statically) that points to the NMI stack frame, and just
make the NMI code itself do something like

 NMI entry:
 - load percpu NMI stack frame pointer
 - if non-zero we know we're nested, and should ignore this NMI:
    - we're returning to kernel mode, so return immediately by using
"popf/ret", which also keeps NMI's disabled in the hardware until the
"real" NMI iret happens.
    - before the popf/iret, use the NMI stack pointer to make the NMI
return stack be invalid and cause a fault
  - set the NMI stack pointer to the current stack pointer

 NMI exit (not the above "immediate exit because we nested"):
   clear the percpu NMI stack pointer
   Just do the iret.

Now, the thing is, now the "iret" is atomic. If we had a nested NMI,
we'll take a fault, and that re-does our "delayed" NMI - and NMI's
will stay masked.

And if we didn't have a nested NMI, that iret will now unmask NMI's,
and everything is happy."

I first tried to follow this advice but as I started implementing this
code, a few gotchas showed up.

One, is accessing per-cpu variables in the NMI handler.

The problem is that per-cpu variables use the %gs register to get the
variable for the given CPU. But as the NMI may happen in userspace,
we must first perform a SWAPGS to get to it. The NMI handler already
does this later in the code, but its too late as we have saved off
all the registers and we don't want to do that for a disabled NMI.

Peter Zijlstra suggested to keep all variables on the stack. This
simplifies things greatly and it has the added benefit of cache locality.

Two, faulting on the iret.

I really wanted to make this work, but it was becoming very hacky, and
I never got it to be stable. The iret already had a fault handler for
userspace faulting with bad segment registers, and getting NMI to trigger
a fault and detect it was very tricky. But for strange reasons, the system
would usually take a double fault and crash. I never figured out why
and decided to go with a simple "jmp" approach. The new approach I took
also simplified things.

Finally, the last problem with Linus's approach was to have the nested
NMI handler do a ret instead of an iret to give the first NMI NMI-context
again.

The problem is that ret is much more limited than an iret. I couldn't figure
out how to get the stack back where it belonged. I could have copied the
current stack, pushed the return onto it, but my fear here is that there
may be some place that writes data below the stack pointer. I know that
is not something code should depend on, but I don't want to chance it.
I may add this feature later, but for now, an NMI handler that loses NMI
context will not get it back.

Here's what is done:

When an NMI comes in, the HW pushes the interrupt stack frame onto the
per cpu NMI stack that is selected by the IST.

A special location on the NMI stack holds a variable that is set when
the first NMI handler runs. If this variable is set then we know that
this is a nested NMI and we process the nested NMI code.

There is still a race when this variable is cleared and an NMI comes
in just before the first NMI does the return. For this case, if the
variable is cleared, we also check if the interrupted stack is the
NMI stack. If it is, then we process the nested NMI code.

Why the two tests and not just test the interrupted stack?

If the first NMI hits a breakpoint and loses NMI context, and then it
hits another breakpoint and while processing that breakpoint we get a
nested NMI. When processing a breakpoint, the stack changes to the
breakpoint stack. If another NMI comes in here we can't rely on the
interrupted stack to be the NMI stack.

If the variable is not set and the interrupted task's stack is not the
NMI stack, then we know this is the first NMI and we can process things
normally. But in order to do so, we need to do a few things first.

1) Set the stack variable that tells us that we are in an NMI handler

2) Make two copies of the interrupt stack frame.
   One copy is used to return on iret
   The other is used to restore the first one if we have a nested NMI.

This is what the stack will look like:

	  +-------------------------+
	  | original SS             |
	  | original Return RSP     |
	  | original RFLAGS         |
	  | original CS             |
	  | original RIP            |
	  +-------------------------+
	  | temp storage for rdx    |
	  +-------------------------+
	  | NMI executing variable  |
	  +-------------------------+
	  | Saved SS                |
	  | Saved Return RSP        |
	  | Saved RFLAGS            |
	  | Saved CS                |
	  | Saved RIP               |
	  +-------------------------+
	  | copied SS               |
	  | copied Return RSP       |
	  | copied RFLAGS           |
	  | copied CS               |
	  | copied RIP              |
	  +-------------------------+
	  | pt_regs                 |
	  +-------------------------+

The original stack frame contains what the HW put in when we entered
the NMI.

We store %rdx as a temp variable to use. Both the original HW stack
frame and this %rdx storage will be clobbered by nested NMIs so we
can not rely on them later in the first NMI handler.

The next item is the special stack variable that is set when we execute
the rest of the NMI handler.

Then we have two copies of the interrupt stack. The second copy is
modified by any nested NMIs to let the first NMI know that we triggered
a second NMI (latched) and that we should repeat the NMI handler.

If the first NMI hits an exception or breakpoint that takes it out of
NMI context, if a second NMI comes in before the first one finishes,
it will update the copied interrupt stack to point to a fix up location
to trigger another NMI.

When the first NMI calls iret, it will instead jump to the fix up
location. This fix up location will copy the saved interrupt stack back
to the copy and execute the nmi handler again.

Note, the nested NMI knows enough to check if it preempted a previous
NMI handler while it is in the fixup location. If it has, it will not
modify the copied interrupt stack and will just leave as if nothing
happened. As the NMI handle is about to execute again, there's no reason
to latch now.

To test all this, I forced the NMI handler to call iret and take itself
out of NMI context. I also added assemble code to write to the serial to
make sure that it hits the nested path as well as the fix up path.
Everything seems to be working fine.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 177 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index d1d5434e7f6a..b62aa298df7f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1475,11 +1475,166 @@ ENTRY(error_exit)
 	CFI_ENDPROC
 END(error_exit)
 
+/*
+ * Test if a given stack is an NMI stack or not.
+ */
+	.macro test_in_nmi reg stack nmi_ret normal_ret
+	cmpq %\reg, \stack
+	ja \normal_ret
+	subq $EXCEPTION_STKSZ, %\reg
+	cmpq %\reg, \stack
+	jb \normal_ret
+	jmp \nmi_ret
+	.endm
 
 	/* runs on exception stack */
 ENTRY(nmi)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
+	/*
+	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
+	 * the iretq it performs will take us out of NMI context.
+	 * This means that we can have nested NMIs where the next
+	 * NMI is using the top of the stack of the previous NMI. We
+	 * can't let it execute because the nested NMI will corrupt the
+	 * stack of the previous NMI. NMI handlers are not re-entrant
+	 * anyway.
+	 *
+	 * To handle this case we do the following:
+	 *  Check the a special location on the stack that contains
+	 *  a variable that is set when NMIs are executing.
+	 *  The interrupted task's stack is also checked to see if it
+	 *  is an NMI stack.
+	 *  If the variable is not set and the stack is not the NMI
+	 *  stack then:
+	 *    o Set the special variable on the stack
+	 *    o Copy the interrupt frame into a "saved" location on the stack
+	 *    o Copy the interrupt frame into a "copy" location on the stack
+	 *    o Continue processing the NMI
+	 *  If the variable is set or the previous stack is the NMI stack:
+	 *    o Modify the "copy" location to jump to the repeate_nmi
+	 *    o return back to the first NMI
+	 *
+	 * Now on exit of the first NMI, we first clear the stack variable
+	 * The NMI stack will tell any nested NMIs at that point that it is
+	 * nested. Then we pop the stack normally with iret, and if there was
+	 * a nested NMI that updated the copy interrupt stack frame, a
+	 * jump will be made to the repeat_nmi code that will handle the second
+	 * NMI.
+	 */
+
+	/* Use %rdx as out temp variable throughout */
+	pushq_cfi %rdx
+
+	/*
+	 * Check the special variable on the stack to see if NMIs are
+	 * executing.
+	 */
+	cmp $1, -8(%rsp)
+	je nested_nmi
+
+	/*
+	 * Now test if the previous stack was an NMI stack.
+	 * We need the double check. We check the NMI stack to satisfy the
+	 * race when the first NMI clears the variable before returning.
+	 * We check the variable because the first NMI could be in a
+	 * breakpoint routine using a breakpoint stack.
+	 */
+	lea 6*8(%rsp), %rdx
+	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+
+nested_nmi:
+	/*
+	 * Do nothing if we interrupted the fixup in repeat_nmi.
+	 * It's about to repeat the NMI handler, so we are fine
+	 * with ignoring this one.
+	 */
+	movq $repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja 1f
+	movq $end_repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja nested_nmi_out
+
+1:
+	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
+	leaq -6*8(%rsp), %rdx
+	movq %rdx, %rsp
+	CFI_ADJUST_CFA_OFFSET 6*8
+	pushq_cfi $__KERNEL_DS
+	pushq_cfi %rdx
+	pushfq_cfi
+	pushq_cfi $__KERNEL_CS
+	pushq_cfi $repeat_nmi
+
+	/* Put stack back */
+	addq $(11*8), %rsp
+	CFI_ADJUST_CFA_OFFSET -11*8
+
+nested_nmi_out:
+	popq_cfi %rdx
+
+	/* No need to check faults here */
+	INTERRUPT_RETURN
+
+first_nmi:
+	/*
+	 * Because nested NMIs will use the pushed location that we
+	 * stored in rdx, we must keep that space available.
+	 * Here's what our stack frame will look like:
+	 * +-------------------------+
+	 * | original SS             |
+	 * | original Return RSP     |
+	 * | original RFLAGS         |
+	 * | original CS             |
+	 * | original RIP            |
+	 * +-------------------------+
+	 * | temp storage for rdx    |
+	 * +-------------------------+
+	 * | NMI executing variable  |
+	 * +-------------------------+
+	 * | Saved SS                |
+	 * | Saved Return RSP        |
+	 * | Saved RFLAGS            |
+	 * | Saved CS                |
+	 * | Saved RIP               |
+	 * +-------------------------+
+	 * | copied SS               |
+	 * | copied Return RSP       |
+	 * | copied RFLAGS           |
+	 * | copied CS               |
+	 * | copied RIP              |
+	 * +-------------------------+
+	 * | pt_regs                 |
+	 * +-------------------------+
+	 *
+	 * The saved RIP is used to fix up the copied RIP that a nested
+	 * NMI may zero out. The original stack frame and the temp storage
+	 * is also used by nested NMIs and can not be trusted on exit.
+	 */
+	/* Set the NMI executing variable on the stack. */
+	pushq_cfi $1
+
+	/* Copy the stack frame to the Saved frame */
+	.rept 5
+	pushq_cfi 6*8(%rsp)
+	.endr
+
+	/* Make another copy, this one may be modified by nested NMIs */
+	.rept 5
+	pushq_cfi 4*8(%rsp)
+	.endr
+
+	/* Do not pop rdx, nested NMIs will corrupt it */
+	movq 11*8(%rsp), %rdx
+
+	/*
+	 * Everything below this point can be preempted by a nested
+	 * NMI if the first NMI took an exception. Repeated NMIs
+	 * caused by an exception and nested NMI will start here, and
+	 * can still be preempted by another NMI.
+	 */
+restart_nmi:
 	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1502,10 +1657,32 @@ nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_ALL 8
+	/* Clear the NMI executing stack variable */
+	movq $0, 10*8(%rsp)
 	jmp irq_return
 	CFI_ENDPROC
 END(nmi)
 
+	/*
+	 * If an NMI hit an iret because of an exception or breakpoint,
+	 * it can lose its NMI context, and a nested NMI may come in.
+	 * In that case, the nested NMI will change the preempted NMI's
+	 * stack to jump to here when it does the final iret.
+	 */
+repeat_nmi:
+	INTR_FRAME
+	/* Update the stack variable to say we are still in NMI */
+	movq $1, 5*8(%rsp)
+
+	/* copy the saved stack back to copy stack */
+	.rept 5
+	pushq_cfi 4*8(%rsp)
+	.endr
+
+	jmp restart_nmi
+	CFI_ENDPROC
+end_repeat_nmi:
+
 ENTRY(ignore_sysret)
 	CFI_STARTPROC
 	mov $-ENOSYS,%eax
-- 
cgit v1.2.3-55-g7522


From 228bdaa95fb830e08b6acd1afd4d2c55093cabfa Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 9 Dec 2011 03:02:19 -0500
Subject: x86: Keep current stack in NMI breakpoints

We want to allow NMI handlers to have breakpoints to be able to
remove stop_machine from ftrace, kprobes and jump_labels. But if
an NMI interrupts a current breakpoint, and then it triggers a
breakpoint itself, it will switch to the breakpoint stack and
corrupt the data on it for the breakpoint processing that it
interrupted.

Instead, have the NMI check if it interrupted breakpoint processing
by checking if the stack that is currently used is a breakpoint
stack. If it is, then load a special IDT that changes the IST
for the debug exception to keep the same stack in kernel context.
When the NMI is done, it puts it back.

This way, if the NMI does trigger a breakpoint, it will keep
using the same stack and not stomp on the breakpoint data for
the breakpoint it interrupted.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/include/asm/desc.h      | 12 ++++++++++++
 arch/x86/include/asm/processor.h |  6 ++++++
 arch/x86/kernel/cpu/common.c     | 22 ++++++++++++++++++++++
 arch/x86/kernel/head_64.S        |  4 ++++
 arch/x86/kernel/nmi.c            | 15 +++++++++++++++
 arch/x86/kernel/traps.c          |  6 ++++++
 6 files changed, 65 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 41935fadfdfc..e95822d683f4 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
 
 extern struct desc_ptr idt_descr;
 extern gate_desc idt_table[];
+extern struct desc_ptr nmi_idt_descr;
+extern gate_desc nmi_idt_table[];
 
 struct gdt_page {
 	struct desc_struct gdt[GDT_ENTRIES];
@@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
 	desc->limit = (limit >> 16) & 0xf;
 }
 
+#ifdef CONFIG_X86_64
+static inline void set_nmi_gate(int gate, void *addr)
+{
+	gate_desc s;
+
+	pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
+	write_idt_entry(nmi_idt_table, gate, &s);
+}
+#endif
+
 static inline void _set_gate(int gate, unsigned type, void *addr,
 			     unsigned dpl, unsigned ist, unsigned seg)
 {
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b650435ffb53..4b39d6d7e3a1 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -402,6 +402,9 @@ DECLARE_PER_CPU(char *, irq_stack_ptr);
 DECLARE_PER_CPU(unsigned int, irq_count);
 extern unsigned long kernel_eflags;
 extern asmlinkage void ignore_sysret(void);
+int is_debug_stack(unsigned long addr);
+void debug_stack_set_zero(void);
+void debug_stack_reset(void);
 #else	/* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
 /*
@@ -416,6 +419,9 @@ struct stack_canary {
 };
 DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
+static inline int is_debug_stack(unsigned long addr) { return 0; }
+static inline void debug_stack_set_zero(void) { }
+static inline void debug_stack_reset(void) { }
 #endif	/* X86_64 */
 
 extern unsigned int xstate_size;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index aa003b13a831..caa404556b9c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1026,6 +1026,8 @@ __setup("clearcpuid=", setup_disablecpuid);
 
 #ifdef CONFIG_X86_64
 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
+struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
+				    (unsigned long) nmi_idt_table };
 
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
 		     irq_stack_union) __aligned(PAGE_SIZE);
@@ -1090,6 +1092,24 @@ unsigned long kernel_eflags;
  */
 DEFINE_PER_CPU(struct orig_ist, orig_ist);
 
+static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+
+int is_debug_stack(unsigned long addr)
+{
+	return addr <= __get_cpu_var(debug_stack_addr) &&
+		addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ);
+}
+
+void debug_stack_set_zero(void)
+{
+	load_idt((const struct desc_ptr *)&nmi_idt_descr);
+}
+
+void debug_stack_reset(void)
+{
+	load_idt((const struct desc_ptr *)&idt_descr);
+}
+
 #else	/* CONFIG_X86_64 */
 
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
@@ -1208,6 +1228,8 @@ void __cpuinit cpu_init(void)
 			estacks += exception_stack_sizes[v];
 			oist->ist[v] = t->x86_tss.ist[v] =
 					(unsigned long)estacks;
+			if (v == DEBUG_STACK-1)
+				per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
 		}
 	}
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e11e39478a49..40f4eb3766d1 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -417,6 +417,10 @@ ENTRY(phys_base)
 ENTRY(idt_table)
 	.skip IDT_ENTRIES * 16
 
+	.align L1_CACHE_BYTES
+ENTRY(nmi_idt_table)
+	.skip IDT_ENTRIES * 16
+
 	__PAGE_ALIGNED_BSS
 	.align PAGE_SIZE
 ENTRY(empty_zero_page)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index e88f37b58ddd..de8d4b333f40 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,6 +408,18 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 dotraplinkage notrace __kprobes void
 do_nmi(struct pt_regs *regs, long error_code)
 {
+	int update_debug_stack = 0;
+
+	/*
+	 * If we interrupted a breakpoint, it is possible that
+	 * the nmi handler will have breakpoints too. We need to
+	 * change the IDT such that breakpoints that happen here
+	 * continue to use the NMI stack.
+	 */
+	if (unlikely(is_debug_stack(regs->sp))) {
+		debug_stack_set_zero();
+		update_debug_stack = 1;
+	}
 	nmi_enter();
 
 	inc_irq_stat(__nmi_count);
@@ -416,6 +428,9 @@ do_nmi(struct pt_regs *regs, long error_code)
 		default_do_nmi(regs);
 
 	nmi_exit();
+
+	if (unlikely(update_debug_stack))
+		debug_stack_reset();
 }
 
 void stop_nmi(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a8e3eb83466c..a93c5cabc36a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -723,4 +723,10 @@ void __init trap_init(void)
 	cpu_init();
 
 	x86_init.irqs.trap_init();
+
+#ifdef CONFIG_X86_64
+	memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
+	set_nmi_gate(1, &debug);
+	set_nmi_gate(3, &int3);
+#endif
 }
-- 
cgit v1.2.3-55-g7522


From ccd49c2391773ffbf52bb80d75c4a92b16972517 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 13 Dec 2011 16:44:16 -0500
Subject: x86: Allow NMIs to hit breakpoints in i386

With i386, NMIs and breakpoints use the current stack and they
do not reset the stack pointer to a fix point that might corrupt
a previous NMI or breakpoint (as it does in x86_64). But NMIs are
still not made to be re-entrant, and need to prevent the case that
an NMI hitting a breakpoint (which does an iret), doesn't allow
another NMI to run.

The fix is to let the NMI be in 3 different states:

1) not running
2) executing
3) latched

When no NMI is executing on a given CPU, the state is "not running".
When the first NMI comes in, the state is switched to "executing".
On exit of that NMI, a cmpxchg is performed to switch the state
back to "not running" and if that fails, the NMI is restarted.

If a breakpoint is hit and does an iret, which re-enables NMIs,
and another NMI comes in before the first NMI finished, it will
detect that the state is not in the "not running" state and the
current NMI is nested. In this case, the state is switched to "latched"
to let the interrupted NMI know to restart the NMI handler, and
the nested NMI exits without doing anything.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/nmi.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 94 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index de8d4b333f40..47acaf319165 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -405,11 +405,84 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		unknown_nmi_error(reason, regs);
 }
 
-dotraplinkage notrace __kprobes void
-do_nmi(struct pt_regs *regs, long error_code)
-{
-	int update_debug_stack = 0;
+/*
+ * NMIs can hit breakpoints which will cause it to lose its
+ * NMI context with the CPU when the breakpoint does an iret.
+ */
+#ifdef CONFIG_X86_32
+/*
+ * For i386, NMIs use the same stack as the kernel, and we can
+ * add a workaround to the iret problem in C. Simply have 3 states
+ * the NMI can be in.
+ *
+ *  1) not running
+ *  2) executing
+ *  3) latched
+ *
+ * When no NMI is in progress, it is in the "not running" state.
+ * When an NMI comes in, it goes into the "executing" state.
+ * Normally, if another NMI is triggered, it does not interrupt
+ * the running NMI and the HW will simply latch it so that when
+ * the first NMI finishes, it will restart the second NMI.
+ * (Note, the latch is binary, thus multiple NMIs triggering,
+ *  when one is running, are ignored. Only one NMI is restarted.)
+ *
+ * If an NMI hits a breakpoint that executes an iret, another
+ * NMI can preempt it. We do not want to allow this new NMI
+ * to run, but we want to execute it when the first one finishes.
+ * We set the state to "latched", and the first NMI will perform
+ * an cmpxchg on the state, and if it doesn't successfully
+ * reset the state to "not running" it will restart the next
+ * NMI.
+ */
+enum nmi_states {
+	NMI_NOT_RUNNING,
+	NMI_EXECUTING,
+	NMI_LATCHED,
+};
+static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+
+#define nmi_nesting_preprocess(regs)					\
+	do {								\
+		if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) {	\
+			__get_cpu_var(nmi_state) = NMI_LATCHED;		\
+			return;						\
+		}							\
+	nmi_restart:							\
+		__get_cpu_var(nmi_state) = NMI_EXECUTING;		\
+	} while (0)
+
+#define nmi_nesting_postprocess()					\
+	do {								\
+		if (cmpxchg(&__get_cpu_var(nmi_state),			\
+		    NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING)	\
+			goto nmi_restart;				\
+	} while (0)
+#else /* x86_64 */
+/*
+ * In x86_64 things are a bit more difficult. This has the same problem
+ * where an NMI hitting a breakpoint that calls iret will remove the
+ * NMI context, allowing a nested NMI to enter. What makes this more
+ * difficult is that both NMIs and breakpoints have their own stack.
+ * When a new NMI or breakpoint is executed, the stack is set to a fixed
+ * point. If an NMI is nested, it will have its stack set at that same
+ * fixed address that the first NMI had, and will start corrupting the
+ * stack. This is handled in entry_64.S, but the same problem exists with
+ * the breakpoint stack.
+ *
+ * If a breakpoint is being processed, and the debug stack is being used,
+ * if an NMI comes in and also hits a breakpoint, the stack pointer
+ * will be set to the same fixed address as the breakpoint that was
+ * interrupted, causing that stack to be corrupted. To handle this case,
+ * check if the stack that was interrupted is the debug stack, and if
+ * so, change the IDT so that new breakpoints will use the current stack
+ * and not switch to the fixed address. On return of the NMI, switch back
+ * to the original IDT.
+ */
+static DEFINE_PER_CPU(int, update_debug_stack);
 
+static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+{
 	/*
 	 * If we interrupted a breakpoint, it is possible that
 	 * the nmi handler will have breakpoints too. We need to
@@ -418,8 +491,22 @@ do_nmi(struct pt_regs *regs, long error_code)
 	 */
 	if (unlikely(is_debug_stack(regs->sp))) {
 		debug_stack_set_zero();
-		update_debug_stack = 1;
+		__get_cpu_var(update_debug_stack) = 1;
 	}
+}
+
+static inline void nmi_nesting_postprocess(void)
+{
+	if (unlikely(__get_cpu_var(update_debug_stack)))
+		debug_stack_reset();
+}
+#endif
+
+dotraplinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
+{
+	nmi_nesting_preprocess(regs);
+
 	nmi_enter();
 
 	inc_irq_stat(__nmi_count);
@@ -429,8 +516,8 @@ do_nmi(struct pt_regs *regs, long error_code)
 
 	nmi_exit();
 
-	if (unlikely(update_debug_stack))
-		debug_stack_reset();
+	/* On i386, may loop back to preprocess */
+	nmi_nesting_postprocess();
 }
 
 void stop_nmi(void)
-- 
cgit v1.2.3-55-g7522


From 42181186ad4db986fcaa40ca95c6e407e9e79372 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 16 Dec 2011 11:43:02 -0500
Subject: x86: Add counter when debug stack is used with interrupts enabled

Mathieu Desnoyers pointed out a case that can cause issues with
NMIs running on the debug stack:

  int3 -> interrupt -> NMI -> int3

Because the interrupt changes the stack, the NMI will not see that
it preempted the debug stack. Looking deeper at this case,
interrupts only happen when the int3 is from userspace or in
an a location in the exception table (fixup).

  userspace -> int3 -> interurpt -> NMI -> int3

All other int3s that happen in the kernel should be processed
without ever enabling interrupts, as the do_trap() call will
panic the kernel if it is called to process any other location
within the kernel.

Adding a counter around the sections that enable interrupts while
using the debug stack allows the NMI to also check that case.
If the NMI sees that it either interrupted a task using the debug
stack or the debug counter is non-zero, then it will have to
change the IDT table to make the int3 not change stacks (which will
corrupt the stack if it does).

Note, I had to move the debug_usage functions out of processor.h
and into debugreg.h because of the static inlined functions to
inc and dec the debug_usage counter. __get_cpu_var() requires
smp.h which includes processor.h, and would fail to build.

Link: http://lkml.kernel.org/r/1323976535.23971.112.camel@gandalf.stny.rr.com

Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/include/asm/debugreg.h  | 22 ++++++++++++++++++++++
 arch/x86/include/asm/processor.h |  6 ------
 arch/x86/kernel/cpu/common.c     |  6 ++++--
 arch/x86/kernel/traps.c          | 14 ++++++++++++++
 4 files changed, 40 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 078ad0caefc6..b903d5ea3941 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump);
 
 extern void hw_breakpoint_restore(void);
 
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(int, debug_stack_usage);
+static inline void debug_stack_usage_inc(void)
+{
+	__get_cpu_var(debug_stack_usage)++;
+}
+static inline void debug_stack_usage_dec(void)
+{
+	__get_cpu_var(debug_stack_usage)--;
+}
+int is_debug_stack(unsigned long addr);
+void debug_stack_set_zero(void);
+void debug_stack_reset(void);
+#else /* !X86_64 */
+static inline int is_debug_stack(unsigned long addr) { return 0; }
+static inline void debug_stack_set_zero(void) { }
+static inline void debug_stack_reset(void) { }
+static inline void debug_stack_usage_inc(void) { }
+static inline void debug_stack_usage_dec(void) { }
+#endif /* X86_64 */
+
+
 #endif	/* __KERNEL__ */
 
 #endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4b39d6d7e3a1..b650435ffb53 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -402,9 +402,6 @@ DECLARE_PER_CPU(char *, irq_stack_ptr);
 DECLARE_PER_CPU(unsigned int, irq_count);
 extern unsigned long kernel_eflags;
 extern asmlinkage void ignore_sysret(void);
-int is_debug_stack(unsigned long addr);
-void debug_stack_set_zero(void);
-void debug_stack_reset(void);
 #else	/* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
 /*
@@ -419,9 +416,6 @@ struct stack_canary {
 };
 DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
-static inline int is_debug_stack(unsigned long addr) { return 0; }
-static inline void debug_stack_set_zero(void) { }
-static inline void debug_stack_reset(void) { }
 #endif	/* X86_64 */
 
 extern unsigned int xstate_size;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index caa404556b9c..266e4649b1da 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1093,11 +1093,13 @@ unsigned long kernel_eflags;
 DEFINE_PER_CPU(struct orig_ist, orig_ist);
 
 static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+DEFINE_PER_CPU(int, debug_stack_usage);
 
 int is_debug_stack(unsigned long addr)
 {
-	return addr <= __get_cpu_var(debug_stack_addr) &&
-		addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ);
+	return __get_cpu_var(debug_stack_usage) ||
+		(addr <= __get_cpu_var(debug_stack_addr) &&
+		 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
 }
 
 void debug_stack_set_zero(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a93c5cabc36a..0072b38e3ea1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -316,9 +316,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 		return;
 #endif
 
+	/*
+	 * Let others (NMI) know that the debug stack is in use
+	 * as we may switch to the interrupt stack.
+	 */
+	debug_stack_usage_inc();
 	preempt_conditional_sti(regs);
 	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
 	preempt_conditional_cli(regs);
+	debug_stack_usage_dec();
 }
 
 #ifdef CONFIG_X86_64
@@ -411,6 +417,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 							SIGTRAP) == NOTIFY_STOP)
 		return;
 
+	/*
+	 * Let others (NMI) know that the debug stack is in use
+	 * as we may switch to the interrupt stack.
+	 */
+	debug_stack_usage_inc();
+
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
@@ -418,6 +430,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 		handle_vm86_trap((struct kernel_vm86_regs *) regs,
 				error_code, 1);
 		preempt_conditional_cli(regs);
+		debug_stack_usage_dec();
 		return;
 	}
 
@@ -437,6 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
 		send_sigtrap(tsk, regs, error_code, si_code);
 	preempt_conditional_cli(regs);
+	debug_stack_usage_dec();
 
 	return;
 }
-- 
cgit v1.2.3-55-g7522


From 8a25a2fd126c621f44f3aeaef80d51f00fc11639 Mon Sep 17 00:00:00 2001
From: Kay Sievers
Date: Wed, 21 Dec 2011 14:29:42 -0800
Subject: cpu: convert 'cpu' and 'machinecheck' sysdev_class to a regular
 subsystem

This moves the 'cpu sysdev_class' over to a regular 'cpu' subsystem
and converts the devices to regular devices. The sysdev drivers are
implemented as subsystem interfaces now.

After all sysdev classes are ported to regular driver core entities, the
sysdev implementation will be entirely removed from the kernel.

Userspace relies on events and generic sysfs subsystem infrastructure
from sysdev devices, which are made available with this conversion.

Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Borislav Petkov <bp@amd64.org>
Cc: Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
Cc: Len Brown <lenb@kernel.org>
Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/avr32/kernel/cpu.c                         |  74 +++----
 arch/ia64/kernel/err_inject.c                   |  52 ++---
 arch/ia64/kernel/topology.c                     |  10 +-
 arch/powerpc/include/asm/spu.h                  |  12 +-
 arch/powerpc/include/asm/topology.h             |  10 +-
 arch/powerpc/kernel/cacheinfo.c                 |  10 +-
 arch/powerpc/kernel/smp.c                       |   2 +-
 arch/powerpc/kernel/sysfs.c                     | 257 ++++++++++++------------
 arch/powerpc/mm/numa.c                          |   8 +-
 arch/powerpc/platforms/cell/cbe_thermal.c       | 144 ++++++-------
 arch/powerpc/platforms/cell/spu_base.c          |  61 +++---
 arch/powerpc/platforms/pseries/pseries_energy.c |  71 ++++---
 arch/powerpc/sysdev/ppc4xx_cpm.c                |   6 +-
 arch/s390/kernel/smp.c                          |  76 +++----
 arch/s390/kernel/topology.c                     |   6 +-
 arch/sh/kernel/cpu/sh4/sq.c                     |  24 ++-
 arch/sparc/kernel/sysfs.c                       | 122 +++++------
 arch/tile/kernel/sysfs.c                        |  61 +++---
 arch/x86/include/asm/mce.h                      |   2 +-
 arch/x86/kernel/cpu/intel_cacheinfo.c           |  25 ++-
 arch/x86/kernel/cpu/mcheck/mce-internal.h       |   4 +-
 arch/x86/kernel/cpu/mcheck/mce.c                | 128 ++++++------
 arch/x86/kernel/cpu/mcheck/mce_amd.c            |  11 +-
 arch/x86/kernel/cpu/mcheck/therm_throt.c        |  63 +++---
 arch/x86/kernel/microcode_core.c                |  58 +++---
 drivers/acpi/processor_driver.c                 |   6 +-
 drivers/acpi/processor_thermal.c                |   1 -
 drivers/base/cpu.c                              | 146 +++++++-------
 drivers/base/node.c                             |   8 +-
 drivers/base/topology.c                         |  51 +++--
 drivers/cpufreq/cpufreq.c                       |  79 ++++----
 drivers/cpufreq/cpufreq_stats.c                 |   1 -
 drivers/cpuidle/cpuidle.c                       |  12 +-
 drivers/cpuidle/cpuidle.h                       |  10 +-
 drivers/cpuidle/sysfs.c                         |  74 ++++---
 drivers/s390/char/sclp_config.c                 |   8 +-
 include/linux/cpu.h                             |  18 +-
 kernel/sched.c                                  |  40 ++--
 38 files changed, 874 insertions(+), 877 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/avr32/kernel/cpu.c b/arch/avr32/kernel/cpu.c
index e84faffbbeca..2233be71e2e8 100644
--- a/arch/avr32/kernel/cpu.c
+++ b/arch/avr32/kernel/cpu.c
@@ -6,7 +6,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/init.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/seq_file.h>
 #include <linux/cpu.h>
 #include <linux/module.h>
@@ -26,16 +26,16 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
  * XXX: If/when a SMP-capable implementation of AVR32 will ever be
  * made, we must make sure that the code executes on the correct CPU.
  */
-static ssize_t show_pc0event(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t show_pc0event(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	unsigned long pccr;
 
 	pccr = sysreg_read(PCCR);
 	return sprintf(buf, "0x%lx\n", (pccr >> 12) & 0x3f);
 }
-static ssize_t store_pc0event(struct sys_device *dev,
-			struct sysdev_attribute *attr, const char *buf,
+static ssize_t store_pc0event(struct device *dev,
+			struct device_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -48,16 +48,16 @@ static ssize_t store_pc0event(struct sys_device *dev,
 	sysreg_write(PCCR, val);
 	return count;
 }
-static ssize_t show_pc0count(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t show_pc0count(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	unsigned long pcnt0;
 
 	pcnt0 = sysreg_read(PCNT0);
 	return sprintf(buf, "%lu\n", pcnt0);
 }
-static ssize_t store_pc0count(struct sys_device *dev,
-				struct sysdev_attribute *attr,
+static ssize_t store_pc0count(struct device *dev,
+				struct device_attribute *attr,
 				const char *buf, size_t count)
 {
 	unsigned long val;
@@ -71,16 +71,16 @@ static ssize_t store_pc0count(struct sys_device *dev,
 	return count;
 }
 
-static ssize_t show_pc1event(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_pc1event(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	unsigned long pccr;
 
 	pccr = sysreg_read(PCCR);
 	return sprintf(buf, "0x%lx\n", (pccr >> 18) & 0x3f);
 }
-static ssize_t store_pc1event(struct sys_device *dev,
-			      struct sysdev_attribute *attr, const char *buf,
+static ssize_t store_pc1event(struct device *dev,
+			      struct device_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -93,16 +93,16 @@ static ssize_t store_pc1event(struct sys_device *dev,
 	sysreg_write(PCCR, val);
 	return count;
 }
-static ssize_t show_pc1count(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_pc1count(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	unsigned long pcnt1;
 
 	pcnt1 = sysreg_read(PCNT1);
 	return sprintf(buf, "%lu\n", pcnt1);
 }
-static ssize_t store_pc1count(struct sys_device *dev,
-				struct sysdev_attribute *attr, const char *buf,
+static ssize_t store_pc1count(struct device *dev,
+				struct device_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -116,16 +116,16 @@ static ssize_t store_pc1count(struct sys_device *dev,
 	return count;
 }
 
-static ssize_t show_pccycles(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_pccycles(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	unsigned long pccnt;
 
 	pccnt = sysreg_read(PCCNT);
 	return sprintf(buf, "%lu\n", pccnt);
 }
-static ssize_t store_pccycles(struct sys_device *dev,
-				struct sysdev_attribute *attr, const char *buf,
+static ssize_t store_pccycles(struct device *dev,
+				struct device_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -139,16 +139,16 @@ static ssize_t store_pccycles(struct sys_device *dev,
 	return count;
 }
 
-static ssize_t show_pcenable(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t show_pcenable(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	unsigned long pccr;
 
 	pccr = sysreg_read(PCCR);
 	return sprintf(buf, "%c\n", (pccr & 1)?'1':'0');
 }
-static ssize_t store_pcenable(struct sys_device *dev,
-			      struct sysdev_attribute *attr, const char *buf,
+static ssize_t store_pcenable(struct device *dev,
+			      struct device_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long pccr, val;
@@ -167,12 +167,12 @@ static ssize_t store_pcenable(struct sys_device *dev,
 	return count;
 }
 
-static SYSDEV_ATTR(pc0event, 0600, show_pc0event, store_pc0event);
-static SYSDEV_ATTR(pc0count, 0600, show_pc0count, store_pc0count);
-static SYSDEV_ATTR(pc1event, 0600, show_pc1event, store_pc1event);
-static SYSDEV_ATTR(pc1count, 0600, show_pc1count, store_pc1count);
-static SYSDEV_ATTR(pccycles, 0600, show_pccycles, store_pccycles);
-static SYSDEV_ATTR(pcenable, 0600, show_pcenable, store_pcenable);
+static DEVICE_ATTR(pc0event, 0600, show_pc0event, store_pc0event);
+static DEVICE_ATTR(pc0count, 0600, show_pc0count, store_pc0count);
+static DEVICE_ATTR(pc1event, 0600, show_pc1event, store_pc1event);
+static DEVICE_ATTR(pc1count, 0600, show_pc1count, store_pc1count);
+static DEVICE_ATTR(pccycles, 0600, show_pccycles, store_pccycles);
+static DEVICE_ATTR(pcenable, 0600, show_pcenable, store_pcenable);
 
 #endif /* CONFIG_PERFORMANCE_COUNTERS */
 
@@ -186,12 +186,12 @@ static int __init topology_init(void)
 		register_cpu(c, cpu);
 
 #ifdef CONFIG_PERFORMANCE_COUNTERS
-		sysdev_create_file(&c->sysdev, &attr_pc0event);
-		sysdev_create_file(&c->sysdev, &attr_pc0count);
-		sysdev_create_file(&c->sysdev, &attr_pc1event);
-		sysdev_create_file(&c->sysdev, &attr_pc1count);
-		sysdev_create_file(&c->sysdev, &attr_pccycles);
-		sysdev_create_file(&c->sysdev, &attr_pcenable);
+		device_create_file(&c->dev, &dev_attr_pc0event);
+		device_create_file(&c->dev, &dev_attr_pc0count);
+		device_create_file(&c->dev, &dev_attr_pc1event);
+		device_create_file(&c->dev, &dev_attr_pc1count);
+		device_create_file(&c->dev, &dev_attr_pccycles);
+		device_create_file(&c->dev, &dev_attr_pcenable);
 #endif
 	}
 
diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
index c539c689493b..2d67317a1ec2 100644
--- a/arch/ia64/kernel/err_inject.c
+++ b/arch/ia64/kernel/err_inject.c
@@ -24,7 +24,7 @@
  * Copyright (C) 2006, Intel Corp.  All rights reserved.
  *
  */
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/cpu.h>
@@ -35,10 +35,10 @@
 #define ERR_DATA_BUFFER_SIZE 3 		// Three 8-byte;
 
 #define define_one_ro(name) 						\
-static SYSDEV_ATTR(name, 0444, show_##name, NULL)
+static DEVICE_ATTR(name, 0444, show_##name, NULL)
 
 #define define_one_rw(name) 						\
-static SYSDEV_ATTR(name, 0644, show_##name, store_##name)
+static DEVICE_ATTR(name, 0644, show_##name, store_##name)
 
 static u64 call_start[NR_CPUS];
 static u64 phys_addr[NR_CPUS];
@@ -55,7 +55,7 @@ static u64 resources[NR_CPUS];
 
 #define show(name) 							\
 static ssize_t 								\
-show_##name(struct sys_device *dev, struct sysdev_attribute *attr,	\
+show_##name(struct device *dev, struct device_attribute *attr,	\
 		char *buf)						\
 {									\
 	u32 cpu=dev->id;						\
@@ -64,7 +64,7 @@ show_##name(struct sys_device *dev, struct sysdev_attribute *attr,	\
 
 #define store(name)							\
 static ssize_t 								\
-store_##name(struct sys_device *dev, struct sysdev_attribute *attr,	\
+store_##name(struct device *dev, struct device_attribute *attr,	\
 					const char *buf, size_t size)	\
 {									\
 	unsigned int cpu=dev->id;					\
@@ -78,7 +78,7 @@ show(call_start)
  * processor. The cpu number in driver is only used for storing data.
  */
 static ssize_t
-store_call_start(struct sys_device *dev, struct sysdev_attribute *attr,
+store_call_start(struct device *dev, struct device_attribute *attr,
 		const char *buf, size_t size)
 {
 	unsigned int cpu=dev->id;
@@ -127,7 +127,7 @@ show(err_type_info)
 store(err_type_info)
 
 static ssize_t
-show_virtual_to_phys(struct sys_device *dev, struct sysdev_attribute *attr,
+show_virtual_to_phys(struct device *dev, struct device_attribute *attr,
 			char *buf)
 {
 	unsigned int cpu=dev->id;
@@ -135,7 +135,7 @@ show_virtual_to_phys(struct sys_device *dev, struct sysdev_attribute *attr,
 }
 
 static ssize_t
-store_virtual_to_phys(struct sys_device *dev, struct sysdev_attribute *attr,
+store_virtual_to_phys(struct device *dev, struct device_attribute *attr,
 			const char *buf, size_t size)
 {
 	unsigned int cpu=dev->id;
@@ -159,8 +159,8 @@ show(err_struct_info)
 store(err_struct_info)
 
 static ssize_t
-show_err_data_buffer(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+show_err_data_buffer(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	unsigned int cpu=dev->id;
 
@@ -171,8 +171,8 @@ show_err_data_buffer(struct sys_device *dev,
 }
 
 static ssize_t
-store_err_data_buffer(struct sys_device *dev,
-			struct sysdev_attribute *attr,
+store_err_data_buffer(struct device *dev,
+			struct device_attribute *attr,
 			const char *buf, size_t size)
 {
 	unsigned int cpu=dev->id;
@@ -209,14 +209,14 @@ define_one_ro(capabilities);
 define_one_ro(resources);
 
 static struct attribute *default_attrs[] = {
-	&attr_call_start.attr,
-	&attr_virtual_to_phys.attr,
-	&attr_err_type_info.attr,
-	&attr_err_struct_info.attr,
-	&attr_err_data_buffer.attr,
-	&attr_status.attr,
-	&attr_capabilities.attr,
-	&attr_resources.attr,
+	&dev_attr_call_start.attr,
+	&dev_attr_virtual_to_phys.attr,
+	&dev_attr_err_type_info.attr,
+	&dev_attr_err_struct_info.attr,
+	&dev_attr_err_data_buffer.attr,
+	&dev_attr_status.attr,
+	&dev_attr_capabilities.attr,
+	&dev_attr_resources.attr,
 	NULL
 };
 
@@ -225,12 +225,12 @@ static struct attribute_group err_inject_attr_group = {
 	.name = "err_inject"
 };
 /* Add/Remove err_inject interface for CPU device */
-static int __cpuinit err_inject_add_dev(struct sys_device * sys_dev)
+static int __cpuinit err_inject_add_dev(struct device * sys_dev)
 {
 	return sysfs_create_group(&sys_dev->kobj, &err_inject_attr_group);
 }
 
-static int __cpuinit err_inject_remove_dev(struct sys_device * sys_dev)
+static int __cpuinit err_inject_remove_dev(struct device * sys_dev)
 {
 	sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group);
 	return 0;
@@ -239,9 +239,9 @@ static int __cpuinit err_inject_cpu_callback(struct notifier_block *nfb,
 		unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *sys_dev;
 
-	sys_dev = get_cpu_sysdev(cpu);
+	sys_dev = get_cpu_device(cpu);
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
@@ -283,13 +283,13 @@ static void __exit
 err_inject_exit(void)
 {
 	int i;
-	struct sys_device *sys_dev;
+	struct device *sys_dev;
 
 #ifdef ERR_INJ_DEBUG
 	printk(KERN_INFO "Exit error injection driver.\n");
 #endif
 	for_each_online_cpu(i) {
-		sys_dev = get_cpu_sysdev(i);
+		sys_dev = get_cpu_device(i);
 		sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group);
 	}
 	unregister_hotcpu_notifier(&err_inject_cpu_notifier);
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 9be1f11a01d9..9deb21dbf629 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -350,7 +350,7 @@ static int __cpuinit cpu_cache_sysfs_init(unsigned int cpu)
 }
 
 /* Add cache interface for CPU device */
-static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
+static int __cpuinit cache_add_dev(struct device * sys_dev)
 {
 	unsigned int cpu = sys_dev->id;
 	unsigned long i, j;
@@ -400,7 +400,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 }
 
 /* Remove cache interface for CPU device */
-static int __cpuinit cache_remove_dev(struct sys_device * sys_dev)
+static int __cpuinit cache_remove_dev(struct device * sys_dev)
 {
 	unsigned int cpu = sys_dev->id;
 	unsigned long i;
@@ -428,9 +428,9 @@ static int __cpuinit cache_cpu_callback(struct notifier_block *nfb,
 		unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *sys_dev;
 
-	sys_dev = get_cpu_sysdev(cpu);
+	sys_dev = get_cpu_device(cpu);
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
@@ -454,7 +454,7 @@ static int __init cache_sysfs_init(void)
 	int i;
 
 	for_each_online_cpu(i) {
-		struct sys_device *sys_dev = get_cpu_sysdev((unsigned int)i);
+		struct device *sys_dev = get_cpu_device((unsigned int)i);
 		cache_add_dev(sys_dev);
 	}
 
diff --git a/arch/powerpc/include/asm/spu.h b/arch/powerpc/include/asm/spu.h
index 4e360bd4a35a..fff921345ddc 100644
--- a/arch/powerpc/include/asm/spu.h
+++ b/arch/powerpc/include/asm/spu.h
@@ -25,7 +25,7 @@
 #ifdef __KERNEL__
 
 #include <linux/workqueue.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/mutex.h>
 
 #define LS_SIZE (256 * 1024)
@@ -166,7 +166,7 @@ struct spu {
 	/* beat only */
 	u64 shadow_int_mask_RW[3];
 
-	struct sys_device sysdev;
+	struct device dev;
 
 	int has_mem_affinity;
 	struct list_head aff_list;
@@ -270,11 +270,11 @@ struct spufs_calls {
 int register_spu_syscalls(struct spufs_calls *calls);
 void unregister_spu_syscalls(struct spufs_calls *calls);
 
-int spu_add_sysdev_attr(struct sysdev_attribute *attr);
-void spu_remove_sysdev_attr(struct sysdev_attribute *attr);
+int spu_add_dev_attr(struct device_attribute *attr);
+void spu_remove_dev_attr(struct device_attribute *attr);
 
-int spu_add_sysdev_attr_group(struct attribute_group *attrs);
-void spu_remove_sysdev_attr_group(struct attribute_group *attrs);
+int spu_add_dev_attr_group(struct attribute_group *attrs);
+void spu_remove_dev_attr_group(struct attribute_group *attrs);
 
 int spu_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
 		unsigned long dsisr, unsigned *flt);
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 1e104af08483..c97185885c6d 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -3,7 +3,7 @@
 #ifdef __KERNEL__
 
 
-struct sys_device;
+struct device;
 struct device_node;
 
 #ifdef CONFIG_NUMA
@@ -86,19 +86,19 @@ extern int __node_distance(int, int);
 
 extern void __init dump_numa_cpu_topology(void);
 
-extern int sysfs_add_device_to_node(struct sys_device *dev, int nid);
-extern void sysfs_remove_device_from_node(struct sys_device *dev, int nid);
+extern int sysfs_add_device_to_node(struct device *dev, int nid);
+extern void sysfs_remove_device_from_node(struct device *dev, int nid);
 
 #else
 
 static inline void dump_numa_cpu_topology(void) {}
 
-static inline int sysfs_add_device_to_node(struct sys_device *dev, int nid)
+static inline int sysfs_add_device_to_node(struct device *dev, int nid)
 {
 	return 0;
 }
 
-static inline void sysfs_remove_device_from_node(struct sys_device *dev,
+static inline void sysfs_remove_device_from_node(struct device *dev,
 						int nid)
 {
 }
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index a3c684b4c862..92c6b008dd2b 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -451,15 +451,15 @@ out:
 static struct cache_dir *__cpuinit cacheinfo_create_cache_dir(unsigned int cpu_id)
 {
 	struct cache_dir *cache_dir;
-	struct sys_device *sysdev;
+	struct device *dev;
 	struct kobject *kobj = NULL;
 
-	sysdev = get_cpu_sysdev(cpu_id);
-	WARN_ONCE(!sysdev, "no sysdev for CPU %i\n", cpu_id);
-	if (!sysdev)
+	dev = get_cpu_device(cpu_id);
+	WARN_ONCE(!dev, "no dev for CPU %i\n", cpu_id);
+	if (!dev)
 		goto err;
 
-	kobj = kobject_create_and_add("cache", &sysdev->kobj);
+	kobj = kobject_create_and_add("cache", &dev->kobj);
 	if (!kobj)
 		goto err;
 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 25ddbfc7dd36..da08240353fa 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -27,7 +27,7 @@
 #include <linux/spinlock.h>
 #include <linux/cache.h>
 #include <linux/err.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/topology.h>
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index ce035c1905f0..f396ef27916b 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -1,4 +1,4 @@
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/percpu.h>
@@ -37,12 +37,12 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
 /* Time in microseconds we delay before sleeping in the idle loop */
 DEFINE_PER_CPU(long, smt_snooze_delay) = { 100 };
 
-static ssize_t store_smt_snooze_delay(struct sys_device *dev,
-				      struct sysdev_attribute *attr,
+static ssize_t store_smt_snooze_delay(struct device *dev,
+				      struct device_attribute *attr,
 				      const char *buf,
 				      size_t count)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
 	ssize_t ret;
 	long snooze;
 
@@ -50,21 +50,21 @@ static ssize_t store_smt_snooze_delay(struct sys_device *dev,
 	if (ret != 1)
 		return -EINVAL;
 
-	per_cpu(smt_snooze_delay, cpu->sysdev.id) = snooze;
+	per_cpu(smt_snooze_delay, cpu->dev.id) = snooze;
 
 	return count;
 }
 
-static ssize_t show_smt_snooze_delay(struct sys_device *dev,
-				     struct sysdev_attribute *attr,
+static ssize_t show_smt_snooze_delay(struct device *dev,
+				     struct device_attribute *attr,
 				     char *buf)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
 
-	return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->sysdev.id));
+	return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->dev.id));
 }
 
-static SYSDEV_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
+static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
 		   store_smt_snooze_delay);
 
 static int __init setup_smt_snooze_delay(char *str)
@@ -117,25 +117,25 @@ static void write_##NAME(void *val) \
 	ppc_enable_pmcs(); \
 	mtspr(ADDRESS, *(unsigned long *)val);	\
 } \
-static ssize_t show_##NAME(struct sys_device *dev, \
-			struct sysdev_attribute *attr, \
+static ssize_t show_##NAME(struct device *dev, \
+			struct device_attribute *attr, \
 			char *buf) \
 { \
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
+	struct cpu *cpu = container_of(dev, struct cpu, dev); \
 	unsigned long val; \
-	smp_call_function_single(cpu->sysdev.id, read_##NAME, &val, 1);	\
+	smp_call_function_single(cpu->dev.id, read_##NAME, &val, 1);	\
 	return sprintf(buf, "%lx\n", val); \
 } \
 static ssize_t __used \
-	store_##NAME(struct sys_device *dev, struct sysdev_attribute *attr, \
+	store_##NAME(struct device *dev, struct device_attribute *attr, \
 			const char *buf, size_t count) \
 { \
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
+	struct cpu *cpu = container_of(dev, struct cpu, dev); \
 	unsigned long val; \
 	int ret = sscanf(buf, "%lx", &val); \
 	if (ret != 1) \
 		return -EINVAL; \
-	smp_call_function_single(cpu->sysdev.id, write_##NAME, &val, 1); \
+	smp_call_function_single(cpu->dev.id, write_##NAME, &val, 1); \
 	return count; \
 }
 
@@ -178,22 +178,22 @@ SYSFS_PMCSETUP(purr, SPRN_PURR);
 SYSFS_PMCSETUP(spurr, SPRN_SPURR);
 SYSFS_PMCSETUP(dscr, SPRN_DSCR);
 
-static SYSDEV_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
-static SYSDEV_ATTR(spurr, 0600, show_spurr, NULL);
-static SYSDEV_ATTR(dscr, 0600, show_dscr, store_dscr);
-static SYSDEV_ATTR(purr, 0600, show_purr, store_purr);
+static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
+static DEVICE_ATTR(spurr, 0600, show_spurr, NULL);
+static DEVICE_ATTR(dscr, 0600, show_dscr, store_dscr);
+static DEVICE_ATTR(purr, 0600, show_purr, store_purr);
 
 unsigned long dscr_default = 0;
 EXPORT_SYMBOL(dscr_default);
 
-static ssize_t show_dscr_default(struct sysdev_class *class,
-		struct sysdev_class_attribute *attr, char *buf)
+static ssize_t show_dscr_default(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%lx\n", dscr_default);
 }
 
-static ssize_t __used store_dscr_default(struct sysdev_class *class,
-		struct sysdev_class_attribute *attr, const char *buf,
+static ssize_t __used store_dscr_default(struct device *dev,
+		struct device_attribute *attr, const char *buf,
 		size_t count)
 {
 	unsigned long val;
@@ -207,15 +207,14 @@ static ssize_t __used store_dscr_default(struct sysdev_class *class,
 	return count;
 }
 
-static SYSDEV_CLASS_ATTR(dscr_default, 0600,
+static DEVICE_ATTR(dscr_default, 0600,
 		show_dscr_default, store_dscr_default);
 
 static void sysfs_create_dscr_default(void)
 {
 	int err = 0;
 	if (cpu_has_feature(CPU_FTR_DSCR))
-		err = sysfs_create_file(&cpu_sysdev_class.kset.kobj,
-			&attr_dscr_default.attr);
+		err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default);
 }
 #endif /* CONFIG_PPC64 */
 
@@ -259,72 +258,72 @@ SYSFS_PMCSETUP(tsr3, SPRN_PA6T_TSR3);
 #endif /* HAS_PPC_PMC_PA6T */
 
 #ifdef HAS_PPC_PMC_IBM
-static struct sysdev_attribute ibm_common_attrs[] = {
-	_SYSDEV_ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
-	_SYSDEV_ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
+static struct device_attribute ibm_common_attrs[] = {
+	__ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
+	__ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
 };
 #endif /* HAS_PPC_PMC_G4 */
 
 #ifdef HAS_PPC_PMC_G4
-static struct sysdev_attribute g4_common_attrs[] = {
-	_SYSDEV_ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
-	_SYSDEV_ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
-	_SYSDEV_ATTR(mmcr2, 0600, show_mmcr2, store_mmcr2),
+static struct device_attribute g4_common_attrs[] = {
+	__ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
+	__ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
+	__ATTR(mmcr2, 0600, show_mmcr2, store_mmcr2),
 };
 #endif /* HAS_PPC_PMC_G4 */
 
-static struct sysdev_attribute classic_pmc_attrs[] = {
-	_SYSDEV_ATTR(pmc1, 0600, show_pmc1, store_pmc1),
-	_SYSDEV_ATTR(pmc2, 0600, show_pmc2, store_pmc2),
-	_SYSDEV_ATTR(pmc3, 0600, show_pmc3, store_pmc3),
-	_SYSDEV_ATTR(pmc4, 0600, show_pmc4, store_pmc4),
-	_SYSDEV_ATTR(pmc5, 0600, show_pmc5, store_pmc5),
-	_SYSDEV_ATTR(pmc6, 0600, show_pmc6, store_pmc6),
+static struct device_attribute classic_pmc_attrs[] = {
+	__ATTR(pmc1, 0600, show_pmc1, store_pmc1),
+	__ATTR(pmc2, 0600, show_pmc2, store_pmc2),
+	__ATTR(pmc3, 0600, show_pmc3, store_pmc3),
+	__ATTR(pmc4, 0600, show_pmc4, store_pmc4),
+	__ATTR(pmc5, 0600, show_pmc5, store_pmc5),
+	__ATTR(pmc6, 0600, show_pmc6, store_pmc6),
 #ifdef CONFIG_PPC64
-	_SYSDEV_ATTR(pmc7, 0600, show_pmc7, store_pmc7),
-	_SYSDEV_ATTR(pmc8, 0600, show_pmc8, store_pmc8),
+	__ATTR(pmc7, 0600, show_pmc7, store_pmc7),
+	__ATTR(pmc8, 0600, show_pmc8, store_pmc8),
 #endif
 };
 
 #ifdef HAS_PPC_PMC_PA6T
-static struct sysdev_attribute pa6t_attrs[] = {
-	_SYSDEV_ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
-	_SYSDEV_ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
-	_SYSDEV_ATTR(pmc0, 0600, show_pa6t_pmc0, store_pa6t_pmc0),
-	_SYSDEV_ATTR(pmc1, 0600, show_pa6t_pmc1, store_pa6t_pmc1),
-	_SYSDEV_ATTR(pmc2, 0600, show_pa6t_pmc2, store_pa6t_pmc2),
-	_SYSDEV_ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3),
-	_SYSDEV_ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4),
-	_SYSDEV_ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5),
+static struct device_attribute pa6t_attrs[] = {
+	__ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
+	__ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
+	__ATTR(pmc0, 0600, show_pa6t_pmc0, store_pa6t_pmc0),
+	__ATTR(pmc1, 0600, show_pa6t_pmc1, store_pa6t_pmc1),
+	__ATTR(pmc2, 0600, show_pa6t_pmc2, store_pa6t_pmc2),
+	__ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3),
+	__ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4),
+	__ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5),
 #ifdef CONFIG_DEBUG_KERNEL
-	_SYSDEV_ATTR(hid0, 0600, show_hid0, store_hid0),
-	_SYSDEV_ATTR(hid1, 0600, show_hid1, store_hid1),
-	_SYSDEV_ATTR(hid4, 0600, show_hid4, store_hid4),
-	_SYSDEV_ATTR(hid5, 0600, show_hid5, store_hid5),
-	_SYSDEV_ATTR(ima0, 0600, show_ima0, store_ima0),
-	_SYSDEV_ATTR(ima1, 0600, show_ima1, store_ima1),
-	_SYSDEV_ATTR(ima2, 0600, show_ima2, store_ima2),
-	_SYSDEV_ATTR(ima3, 0600, show_ima3, store_ima3),
-	_SYSDEV_ATTR(ima4, 0600, show_ima4, store_ima4),
-	_SYSDEV_ATTR(ima5, 0600, show_ima5, store_ima5),
-	_SYSDEV_ATTR(ima6, 0600, show_ima6, store_ima6),
-	_SYSDEV_ATTR(ima7, 0600, show_ima7, store_ima7),
-	_SYSDEV_ATTR(ima8, 0600, show_ima8, store_ima8),
-	_SYSDEV_ATTR(ima9, 0600, show_ima9, store_ima9),
-	_SYSDEV_ATTR(imaat, 0600, show_imaat, store_imaat),
-	_SYSDEV_ATTR(btcr, 0600, show_btcr, store_btcr),
-	_SYSDEV_ATTR(pccr, 0600, show_pccr, store_pccr),
-	_SYSDEV_ATTR(rpccr, 0600, show_rpccr, store_rpccr),
-	_SYSDEV_ATTR(der, 0600, show_der, store_der),
-	_SYSDEV_ATTR(mer, 0600, show_mer, store_mer),
-	_SYSDEV_ATTR(ber, 0600, show_ber, store_ber),
-	_SYSDEV_ATTR(ier, 0600, show_ier, store_ier),
-	_SYSDEV_ATTR(sier, 0600, show_sier, store_sier),
-	_SYSDEV_ATTR(siar, 0600, show_siar, store_siar),
-	_SYSDEV_ATTR(tsr0, 0600, show_tsr0, store_tsr0),
-	_SYSDEV_ATTR(tsr1, 0600, show_tsr1, store_tsr1),
-	_SYSDEV_ATTR(tsr2, 0600, show_tsr2, store_tsr2),
-	_SYSDEV_ATTR(tsr3, 0600, show_tsr3, store_tsr3),
+	__ATTR(hid0, 0600, show_hid0, store_hid0),
+	__ATTR(hid1, 0600, show_hid1, store_hid1),
+	__ATTR(hid4, 0600, show_hid4, store_hid4),
+	__ATTR(hid5, 0600, show_hid5, store_hid5),
+	__ATTR(ima0, 0600, show_ima0, store_ima0),
+	__ATTR(ima1, 0600, show_ima1, store_ima1),
+	__ATTR(ima2, 0600, show_ima2, store_ima2),
+	__ATTR(ima3, 0600, show_ima3, store_ima3),
+	__ATTR(ima4, 0600, show_ima4, store_ima4),
+	__ATTR(ima5, 0600, show_ima5, store_ima5),
+	__ATTR(ima6, 0600, show_ima6, store_ima6),
+	__ATTR(ima7, 0600, show_ima7, store_ima7),
+	__ATTR(ima8, 0600, show_ima8, store_ima8),
+	__ATTR(ima9, 0600, show_ima9, store_ima9),
+	__ATTR(imaat, 0600, show_imaat, store_imaat),
+	__ATTR(btcr, 0600, show_btcr, store_btcr),
+	__ATTR(pccr, 0600, show_pccr, store_pccr),
+	__ATTR(rpccr, 0600, show_rpccr, store_rpccr),
+	__ATTR(der, 0600, show_der, store_der),
+	__ATTR(mer, 0600, show_mer, store_mer),
+	__ATTR(ber, 0600, show_ber, store_ber),
+	__ATTR(ier, 0600, show_ier, store_ier),
+	__ATTR(sier, 0600, show_sier, store_sier),
+	__ATTR(siar, 0600, show_siar, store_siar),
+	__ATTR(tsr0, 0600, show_tsr0, store_tsr0),
+	__ATTR(tsr1, 0600, show_tsr1, store_tsr1),
+	__ATTR(tsr2, 0600, show_tsr2, store_tsr2),
+	__ATTR(tsr3, 0600, show_tsr3, store_tsr3),
 #endif /* CONFIG_DEBUG_KERNEL */
 };
 #endif /* HAS_PPC_PMC_PA6T */
@@ -333,14 +332,14 @@ static struct sysdev_attribute pa6t_attrs[] = {
 static void __cpuinit register_cpu_online(unsigned int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
-	struct sys_device *s = &c->sysdev;
-	struct sysdev_attribute *attrs, *pmc_attrs;
+	struct device *s = &c->dev;
+	struct device_attribute *attrs, *pmc_attrs;
 	int i, nattrs;
 
 #ifdef CONFIG_PPC64
 	if (!firmware_has_feature(FW_FEATURE_ISERIES) &&
 			cpu_has_feature(CPU_FTR_SMT))
-		sysdev_create_file(s, &attr_smt_snooze_delay);
+		device_create_file(s, &dev_attr_smt_snooze_delay);
 #endif
 
 	/* PMC stuff */
@@ -348,14 +347,14 @@ static void __cpuinit register_cpu_online(unsigned int cpu)
 #ifdef HAS_PPC_PMC_IBM
 	case PPC_PMC_IBM:
 		attrs = ibm_common_attrs;
-		nattrs = sizeof(ibm_common_attrs) / sizeof(struct sysdev_attribute);
+		nattrs = sizeof(ibm_common_attrs) / sizeof(struct device_attribute);
 		pmc_attrs = classic_pmc_attrs;
 		break;
 #endif /* HAS_PPC_PMC_IBM */
 #ifdef HAS_PPC_PMC_G4
 	case PPC_PMC_G4:
 		attrs = g4_common_attrs;
-		nattrs = sizeof(g4_common_attrs) / sizeof(struct sysdev_attribute);
+		nattrs = sizeof(g4_common_attrs) / sizeof(struct device_attribute);
 		pmc_attrs = classic_pmc_attrs;
 		break;
 #endif /* HAS_PPC_PMC_G4 */
@@ -363,7 +362,7 @@ static void __cpuinit register_cpu_online(unsigned int cpu)
 	case PPC_PMC_PA6T:
 		/* PA Semi starts counting at PMC0 */
 		attrs = pa6t_attrs;
-		nattrs = sizeof(pa6t_attrs) / sizeof(struct sysdev_attribute);
+		nattrs = sizeof(pa6t_attrs) / sizeof(struct device_attribute);
 		pmc_attrs = NULL;
 		break;
 #endif /* HAS_PPC_PMC_PA6T */
@@ -374,24 +373,24 @@ static void __cpuinit register_cpu_online(unsigned int cpu)
 	}
 
 	for (i = 0; i < nattrs; i++)
-		sysdev_create_file(s, &attrs[i]);
+		device_create_file(s, &attrs[i]);
 
 	if (pmc_attrs)
 		for (i = 0; i < cur_cpu_spec->num_pmcs; i++)
-			sysdev_create_file(s, &pmc_attrs[i]);
+			device_create_file(s, &pmc_attrs[i]);
 
 #ifdef CONFIG_PPC64
 	if (cpu_has_feature(CPU_FTR_MMCRA))
-		sysdev_create_file(s, &attr_mmcra);
+		device_create_file(s, &dev_attr_mmcra);
 
 	if (cpu_has_feature(CPU_FTR_PURR))
-		sysdev_create_file(s, &attr_purr);
+		device_create_file(s, &dev_attr_purr);
 
 	if (cpu_has_feature(CPU_FTR_SPURR))
-		sysdev_create_file(s, &attr_spurr);
+		device_create_file(s, &dev_attr_spurr);
 
 	if (cpu_has_feature(CPU_FTR_DSCR))
-		sysdev_create_file(s, &attr_dscr);
+		device_create_file(s, &dev_attr_dscr);
 #endif /* CONFIG_PPC64 */
 
 	cacheinfo_cpu_online(cpu);
@@ -401,8 +400,8 @@ static void __cpuinit register_cpu_online(unsigned int cpu)
 static void unregister_cpu_online(unsigned int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
-	struct sys_device *s = &c->sysdev;
-	struct sysdev_attribute *attrs, *pmc_attrs;
+	struct device *s = &c->dev;
+	struct device_attribute *attrs, *pmc_attrs;
 	int i, nattrs;
 
 	BUG_ON(!c->hotpluggable);
@@ -410,7 +409,7 @@ static void unregister_cpu_online(unsigned int cpu)
 #ifdef CONFIG_PPC64
 	if (!firmware_has_feature(FW_FEATURE_ISERIES) &&
 			cpu_has_feature(CPU_FTR_SMT))
-		sysdev_remove_file(s, &attr_smt_snooze_delay);
+		device_remove_file(s, &dev_attr_smt_snooze_delay);
 #endif
 
 	/* PMC stuff */
@@ -418,14 +417,14 @@ static void unregister_cpu_online(unsigned int cpu)
 #ifdef HAS_PPC_PMC_IBM
 	case PPC_PMC_IBM:
 		attrs = ibm_common_attrs;
-		nattrs = sizeof(ibm_common_attrs) / sizeof(struct sysdev_attribute);
+		nattrs = sizeof(ibm_common_attrs) / sizeof(struct device_attribute);
 		pmc_attrs = classic_pmc_attrs;
 		break;
 #endif /* HAS_PPC_PMC_IBM */
 #ifdef HAS_PPC_PMC_G4
 	case PPC_PMC_G4:
 		attrs = g4_common_attrs;
-		nattrs = sizeof(g4_common_attrs) / sizeof(struct sysdev_attribute);
+		nattrs = sizeof(g4_common_attrs) / sizeof(struct device_attribute);
 		pmc_attrs = classic_pmc_attrs;
 		break;
 #endif /* HAS_PPC_PMC_G4 */
@@ -433,7 +432,7 @@ static void unregister_cpu_online(unsigned int cpu)
 	case PPC_PMC_PA6T:
 		/* PA Semi starts counting at PMC0 */
 		attrs = pa6t_attrs;
-		nattrs = sizeof(pa6t_attrs) / sizeof(struct sysdev_attribute);
+		nattrs = sizeof(pa6t_attrs) / sizeof(struct device_attribute);
 		pmc_attrs = NULL;
 		break;
 #endif /* HAS_PPC_PMC_PA6T */
@@ -444,24 +443,24 @@ static void unregister_cpu_online(unsigned int cpu)
 	}
 
 	for (i = 0; i < nattrs; i++)
-		sysdev_remove_file(s, &attrs[i]);
+		device_remove_file(s, &attrs[i]);
 
 	if (pmc_attrs)
 		for (i = 0; i < cur_cpu_spec->num_pmcs; i++)
-			sysdev_remove_file(s, &pmc_attrs[i]);
+			device_remove_file(s, &pmc_attrs[i]);
 
 #ifdef CONFIG_PPC64
 	if (cpu_has_feature(CPU_FTR_MMCRA))
-		sysdev_remove_file(s, &attr_mmcra);
+		device_remove_file(s, &dev_attr_mmcra);
 
 	if (cpu_has_feature(CPU_FTR_PURR))
-		sysdev_remove_file(s, &attr_purr);
+		device_remove_file(s, &dev_attr_purr);
 
 	if (cpu_has_feature(CPU_FTR_SPURR))
-		sysdev_remove_file(s, &attr_spurr);
+		device_remove_file(s, &dev_attr_spurr);
 
 	if (cpu_has_feature(CPU_FTR_DSCR))
-		sysdev_remove_file(s, &attr_dscr);
+		device_remove_file(s, &dev_attr_dscr);
 #endif /* CONFIG_PPC64 */
 
 	cacheinfo_cpu_offline(cpu);
@@ -513,70 +512,70 @@ static struct notifier_block __cpuinitdata sysfs_cpu_nb = {
 
 static DEFINE_MUTEX(cpu_mutex);
 
-int cpu_add_sysdev_attr(struct sysdev_attribute *attr)
+int cpu_add_dev_attr(struct device_attribute *attr)
 {
 	int cpu;
 
 	mutex_lock(&cpu_mutex);
 
 	for_each_possible_cpu(cpu) {
-		sysdev_create_file(get_cpu_sysdev(cpu), attr);
+		device_create_file(get_cpu_device(cpu), attr);
 	}
 
 	mutex_unlock(&cpu_mutex);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(cpu_add_sysdev_attr);
+EXPORT_SYMBOL_GPL(cpu_add_dev_attr);
 
-int cpu_add_sysdev_attr_group(struct attribute_group *attrs)
+int cpu_add_dev_attr_group(struct attribute_group *attrs)
 {
 	int cpu;
-	struct sys_device *sysdev;
+	struct device *dev;
 	int ret;
 
 	mutex_lock(&cpu_mutex);
 
 	for_each_possible_cpu(cpu) {
-		sysdev = get_cpu_sysdev(cpu);
-		ret = sysfs_create_group(&sysdev->kobj, attrs);
+		dev = get_cpu_device(cpu);
+		ret = sysfs_create_group(&dev->kobj, attrs);
 		WARN_ON(ret != 0);
 	}
 
 	mutex_unlock(&cpu_mutex);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(cpu_add_sysdev_attr_group);
+EXPORT_SYMBOL_GPL(cpu_add_dev_attr_group);
 
 
-void cpu_remove_sysdev_attr(struct sysdev_attribute *attr)
+void cpu_remove_dev_attr(struct device_attribute *attr)
 {
 	int cpu;
 
 	mutex_lock(&cpu_mutex);
 
 	for_each_possible_cpu(cpu) {
-		sysdev_remove_file(get_cpu_sysdev(cpu), attr);
+		device_remove_file(get_cpu_device(cpu), attr);
 	}
 
 	mutex_unlock(&cpu_mutex);
 }
-EXPORT_SYMBOL_GPL(cpu_remove_sysdev_attr);
+EXPORT_SYMBOL_GPL(cpu_remove_dev_attr);
 
-void cpu_remove_sysdev_attr_group(struct attribute_group *attrs)
+void cpu_remove_dev_attr_group(struct attribute_group *attrs)
 {
 	int cpu;
-	struct sys_device *sysdev;
+	struct device *dev;
 
 	mutex_lock(&cpu_mutex);
 
 	for_each_possible_cpu(cpu) {
-		sysdev = get_cpu_sysdev(cpu);
-		sysfs_remove_group(&sysdev->kobj, attrs);
+		dev = get_cpu_device(cpu);
+		sysfs_remove_group(&dev->kobj, attrs);
 	}
 
 	mutex_unlock(&cpu_mutex);
 }
-EXPORT_SYMBOL_GPL(cpu_remove_sysdev_attr_group);
+EXPORT_SYMBOL_GPL(cpu_remove_dev_attr_group);
 
 
 /* NUMA stuff */
@@ -590,7 +589,7 @@ static void register_nodes(void)
 		register_one_node(i);
 }
 
-int sysfs_add_device_to_node(struct sys_device *dev, int nid)
+int sysfs_add_device_to_node(struct device *dev, int nid)
 {
 	struct node *node = &node_devices[nid];
 	return sysfs_create_link(&node->sysdev.kobj, &dev->kobj,
@@ -598,7 +597,7 @@ int sysfs_add_device_to_node(struct sys_device *dev, int nid)
 }
 EXPORT_SYMBOL_GPL(sysfs_add_device_to_node);
 
-void sysfs_remove_device_from_node(struct sys_device *dev, int nid)
+void sysfs_remove_device_from_node(struct device *dev, int nid)
 {
 	struct node *node = &node_devices[nid];
 	sysfs_remove_link(&node->sysdev.kobj, kobject_name(&dev->kobj));
@@ -614,14 +613,14 @@ static void register_nodes(void)
 #endif
 
 /* Only valid if CPU is present. */
-static ssize_t show_physical_id(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_physical_id(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
 
-	return sprintf(buf, "%d\n", get_hard_smp_processor_id(cpu->sysdev.id));
+	return sprintf(buf, "%d\n", get_hard_smp_processor_id(cpu->dev.id));
 }
-static SYSDEV_ATTR(physical_id, 0444, show_physical_id, NULL);
+static DEVICE_ATTR(physical_id, 0444, show_physical_id, NULL);
 
 static int __init topology_init(void)
 {
@@ -646,7 +645,7 @@ static int __init topology_init(void)
 		if (cpu_online(cpu) || c->hotpluggable) {
 			register_cpu(c, cpu);
 
-			sysdev_create_file(&c->sysdev, &attr_physical_id);
+			device_create_file(&c->dev, &dev_attr_physical_id);
 		}
 
 		if (cpu_online(cpu))
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index c7dd4dec4df8..f2b03a863430 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1452,7 +1452,7 @@ int arch_update_cpu_topology(void)
 {
 	int cpu, nid, old_nid;
 	unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
-	struct sys_device *sysdev;
+	struct device *dev;
 
 	for_each_cpu(cpu,&cpu_associativity_changes_mask) {
 		vphn_get_associativity(cpu, associativity);
@@ -1473,9 +1473,9 @@ int arch_update_cpu_topology(void)
 		register_cpu_under_node(cpu, nid);
 		put_online_cpus();
 
-		sysdev = get_cpu_sysdev(cpu);
-		if (sysdev)
-			kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
+		dev = get_cpu_device(cpu);
+		if (dev)
+			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
 	}
 
 	return 1;
diff --git a/arch/powerpc/platforms/cell/cbe_thermal.c b/arch/powerpc/platforms/cell/cbe_thermal.c
index 4d4c8c169124..94560db788bf 100644
--- a/arch/powerpc/platforms/cell/cbe_thermal.c
+++ b/arch/powerpc/platforms/cell/cbe_thermal.c
@@ -46,7 +46,7 @@
  */
 
 #include <linux/module.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/kernel.h>
 #include <linux/cpu.h>
 #include <asm/spu.h>
@@ -59,8 +59,8 @@
 #define TEMP_MIN 65
 #define TEMP_MAX 125
 
-#define SYSDEV_PREFIX_ATTR(_prefix,_name,_mode)			\
-struct sysdev_attribute attr_ ## _prefix ## _ ## _name = {	\
+#define DEVICE_PREFIX_ATTR(_prefix,_name,_mode)			\
+struct device_attribute attr_ ## _prefix ## _ ## _name = {	\
 	.attr = { .name = __stringify(_name), .mode = _mode },	\
 	.show	= _prefix ## _show_ ## _name,			\
 	.store	= _prefix ## _store_ ## _name,			\
@@ -76,36 +76,36 @@ static inline u8 temp_to_reg(u8 temp)
 	return ((temp - TEMP_MIN) >> 1) & 0x3f;
 }
 
-static struct cbe_pmd_regs __iomem *get_pmd_regs(struct sys_device *sysdev)
+static struct cbe_pmd_regs __iomem *get_pmd_regs(struct device *dev)
 {
 	struct spu *spu;
 
-	spu = container_of(sysdev, struct spu, sysdev);
+	spu = container_of(dev, struct spu, dev);
 
 	return cbe_get_pmd_regs(spu_devnode(spu));
 }
 
 /* returns the value for a given spu in a given register */
-static u8 spu_read_register_value(struct sys_device *sysdev, union spe_reg __iomem *reg)
+static u8 spu_read_register_value(struct device *dev, union spe_reg __iomem *reg)
 {
 	union spe_reg value;
 	struct spu *spu;
 
-	spu = container_of(sysdev, struct spu, sysdev);
+	spu = container_of(dev, struct spu, dev);
 	value.val = in_be64(&reg->val);
 
 	return value.spe[spu->spe_id];
 }
 
-static ssize_t spu_show_temp(struct sys_device *sysdev, struct sysdev_attribute *attr,
+static ssize_t spu_show_temp(struct device *dev, struct device_attribute *attr,
 			char *buf)
 {
 	u8 value;
 	struct cbe_pmd_regs __iomem *pmd_regs;
 
-	pmd_regs = get_pmd_regs(sysdev);
+	pmd_regs = get_pmd_regs(dev);
 
-	value = spu_read_register_value(sysdev, &pmd_regs->ts_ctsr1);
+	value = spu_read_register_value(dev, &pmd_regs->ts_ctsr1);
 
 	return sprintf(buf, "%d\n", reg_to_temp(value));
 }
@@ -147,48 +147,48 @@ static ssize_t store_throttle(struct cbe_pmd_regs __iomem *pmd_regs, const char
 	return size;
 }
 
-static ssize_t spu_show_throttle_end(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t spu_show_throttle_end(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return show_throttle(get_pmd_regs(sysdev), buf, 0);
+	return show_throttle(get_pmd_regs(dev), buf, 0);
 }
 
-static ssize_t spu_show_throttle_begin(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t spu_show_throttle_begin(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return show_throttle(get_pmd_regs(sysdev), buf, 8);
+	return show_throttle(get_pmd_regs(dev), buf, 8);
 }
 
-static ssize_t spu_show_throttle_full_stop(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t spu_show_throttle_full_stop(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return show_throttle(get_pmd_regs(sysdev), buf, 16);
+	return show_throttle(get_pmd_regs(dev), buf, 16);
 }
 
-static ssize_t spu_store_throttle_end(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, const char *buf, size_t size)
+static ssize_t spu_store_throttle_end(struct device *dev,
+			struct device_attribute *attr, const char *buf, size_t size)
 {
-	return store_throttle(get_pmd_regs(sysdev), buf, size, 0);
+	return store_throttle(get_pmd_regs(dev), buf, size, 0);
 }
 
-static ssize_t spu_store_throttle_begin(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, const char *buf, size_t size)
+static ssize_t spu_store_throttle_begin(struct device *dev,
+			struct device_attribute *attr, const char *buf, size_t size)
 {
-	return store_throttle(get_pmd_regs(sysdev), buf, size, 8);
+	return store_throttle(get_pmd_regs(dev), buf, size, 8);
 }
 
-static ssize_t spu_store_throttle_full_stop(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, const char *buf, size_t size)
+static ssize_t spu_store_throttle_full_stop(struct device *dev,
+			struct device_attribute *attr, const char *buf, size_t size)
 {
-	return store_throttle(get_pmd_regs(sysdev), buf, size, 16);
+	return store_throttle(get_pmd_regs(dev), buf, size, 16);
 }
 
-static ssize_t ppe_show_temp(struct sys_device *sysdev, char *buf, int pos)
+static ssize_t ppe_show_temp(struct device *dev, char *buf, int pos)
 {
 	struct cbe_pmd_regs __iomem *pmd_regs;
 	u64 value;
 
-	pmd_regs = cbe_get_cpu_pmd_regs(sysdev->id);
+	pmd_regs = cbe_get_cpu_pmd_regs(dev->id);
 	value = in_be64(&pmd_regs->ts_ctsr2);
 
 	value = (value >> pos) & 0x3f;
@@ -199,64 +199,64 @@ static ssize_t ppe_show_temp(struct sys_device *sysdev, char *buf, int pos)
 
 /* shows the temperature of the DTS on the PPE,
  * located near the linear thermal sensor */
-static ssize_t ppe_show_temp0(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t ppe_show_temp0(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return ppe_show_temp(sysdev, buf, 32);
+	return ppe_show_temp(dev, buf, 32);
 }
 
 /* shows the temperature of the second DTS on the PPE */
-static ssize_t ppe_show_temp1(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t ppe_show_temp1(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return ppe_show_temp(sysdev, buf, 0);
+	return ppe_show_temp(dev, buf, 0);
 }
 
-static ssize_t ppe_show_throttle_end(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t ppe_show_throttle_end(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return show_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, 32);
+	return show_throttle(cbe_get_cpu_pmd_regs(dev->id), buf, 32);
 }
 
-static ssize_t ppe_show_throttle_begin(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t ppe_show_throttle_begin(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return show_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, 40);
+	return show_throttle(cbe_get_cpu_pmd_regs(dev->id), buf, 40);
 }
 
-static ssize_t ppe_show_throttle_full_stop(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t ppe_show_throttle_full_stop(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
-	return show_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, 48);
+	return show_throttle(cbe_get_cpu_pmd_regs(dev->id), buf, 48);
 }
 
-static ssize_t ppe_store_throttle_end(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, const char *buf, size_t size)
+static ssize_t ppe_store_throttle_end(struct device *dev,
+			struct device_attribute *attr, const char *buf, size_t size)
 {
-	return store_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, size, 32);
+	return store_throttle(cbe_get_cpu_pmd_regs(dev->id), buf, size, 32);
 }
 
-static ssize_t ppe_store_throttle_begin(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, const char *buf, size_t size)
+static ssize_t ppe_store_throttle_begin(struct device *dev,
+			struct device_attribute *attr, const char *buf, size_t size)
 {
-	return store_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, size, 40);
+	return store_throttle(cbe_get_cpu_pmd_regs(dev->id), buf, size, 40);
 }
 
-static ssize_t ppe_store_throttle_full_stop(struct sys_device *sysdev,
-			struct sysdev_attribute *attr, const char *buf, size_t size)
+static ssize_t ppe_store_throttle_full_stop(struct device *dev,
+			struct device_attribute *attr, const char *buf, size_t size)
 {
-	return store_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, size, 48);
+	return store_throttle(cbe_get_cpu_pmd_regs(dev->id), buf, size, 48);
 }
 
 
-static struct sysdev_attribute attr_spu_temperature = {
+static struct device_attribute attr_spu_temperature = {
 	.attr = {.name = "temperature", .mode = 0400 },
 	.show = spu_show_temp,
 };
 
-static SYSDEV_PREFIX_ATTR(spu, throttle_end, 0600);
-static SYSDEV_PREFIX_ATTR(spu, throttle_begin, 0600);
-static SYSDEV_PREFIX_ATTR(spu, throttle_full_stop, 0600);
+static DEVICE_PREFIX_ATTR(spu, throttle_end, 0600);
+static DEVICE_PREFIX_ATTR(spu, throttle_begin, 0600);
+static DEVICE_PREFIX_ATTR(spu, throttle_full_stop, 0600);
 
 
 static struct attribute *spu_attributes[] = {
@@ -272,19 +272,19 @@ static struct attribute_group spu_attribute_group = {
 	.attrs	= spu_attributes,
 };
 
-static struct sysdev_attribute attr_ppe_temperature0 = {
+static struct device_attribute attr_ppe_temperature0 = {
 	.attr = {.name = "temperature0", .mode = 0400 },
 	.show = ppe_show_temp0,
 };
 
-static struct sysdev_attribute attr_ppe_temperature1 = {
+static struct device_attribute attr_ppe_temperature1 = {
 	.attr = {.name = "temperature1", .mode = 0400 },
 	.show = ppe_show_temp1,
 };
 
-static SYSDEV_PREFIX_ATTR(ppe, throttle_end, 0600);
-static SYSDEV_PREFIX_ATTR(ppe, throttle_begin, 0600);
-static SYSDEV_PREFIX_ATTR(ppe, throttle_full_stop, 0600);
+static DEVICE_PREFIX_ATTR(ppe, throttle_end, 0600);
+static DEVICE_PREFIX_ATTR(ppe, throttle_begin, 0600);
+static DEVICE_PREFIX_ATTR(ppe, throttle_full_stop, 0600);
 
 static struct attribute *ppe_attributes[] = {
 	&attr_ppe_temperature0.attr,
@@ -307,7 +307,7 @@ static int __init init_default_values(void)
 {
 	int cpu;
 	struct cbe_pmd_regs __iomem *pmd_regs;
-	struct sys_device *sysdev;
+	struct device *dev;
 	union ppe_spe_reg tpr;
 	union spe_reg str1;
 	u64 str2;
@@ -349,14 +349,14 @@ static int __init init_default_values(void)
 
 	for_each_possible_cpu (cpu) {
 		pr_debug("processing cpu %d\n", cpu);
-		sysdev = get_cpu_sysdev(cpu);
+		dev = get_cpu_device(cpu);
 
-		if (!sysdev) {
-			pr_info("invalid sysdev pointer for cbe_thermal\n");
+		if (!dev) {
+			pr_info("invalid dev pointer for cbe_thermal\n");
 			return -EINVAL;
 		}
 
-		pmd_regs = cbe_get_cpu_pmd_regs(sysdev->id);
+		pmd_regs = cbe_get_cpu_pmd_regs(dev->id);
 
 		if (!pmd_regs) {
 			pr_info("invalid CBE regs pointer for cbe_thermal\n");
@@ -379,8 +379,8 @@ static int __init thermal_init(void)
 	int rc = init_default_values();
 
 	if (rc == 0) {
-		spu_add_sysdev_attr_group(&spu_attribute_group);
-		cpu_add_sysdev_attr_group(&ppe_attribute_group);
+		spu_add_dev_attr_group(&spu_attribute_group);
+		cpu_add_dev_attr_group(&ppe_attribute_group);
 	}
 
 	return rc;
@@ -389,8 +389,8 @@ module_init(thermal_init);
 
 static void __exit thermal_exit(void)
 {
-	spu_remove_sysdev_attr_group(&spu_attribute_group);
-	cpu_remove_sysdev_attr_group(&ppe_attribute_group);
+	spu_remove_dev_attr_group(&spu_attribute_group);
+	cpu_remove_dev_attr_group(&ppe_attribute_group);
 }
 module_exit(thermal_exit);
 
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 3675da73623f..1708fb7aba35 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -522,31 +522,32 @@ void spu_init_channels(struct spu *spu)
 }
 EXPORT_SYMBOL_GPL(spu_init_channels);
 
-static struct sysdev_class spu_sysdev_class = {
+static struct bus_type spu_subsys = {
 	.name = "spu",
+	.dev_name = "spu",
 };
 
-int spu_add_sysdev_attr(struct sysdev_attribute *attr)
+int spu_add_dev_attr(struct device_attribute *attr)
 {
 	struct spu *spu;
 
 	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
-		sysdev_create_file(&spu->sysdev, attr);
+		device_create_file(&spu->dev, attr);
 	mutex_unlock(&spu_full_list_mutex);
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(spu_add_sysdev_attr);
+EXPORT_SYMBOL_GPL(spu_add_dev_attr);
 
-int spu_add_sysdev_attr_group(struct attribute_group *attrs)
+int spu_add_dev_attr_group(struct attribute_group *attrs)
 {
 	struct spu *spu;
 	int rc = 0;
 
 	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list) {
-		rc = sysfs_create_group(&spu->sysdev.kobj, attrs);
+		rc = sysfs_create_group(&spu->dev.kobj, attrs);
 
 		/* we're in trouble here, but try unwinding anyway */
 		if (rc) {
@@ -555,7 +556,7 @@ int spu_add_sysdev_attr_group(struct attribute_group *attrs)
 
 			list_for_each_entry_continue_reverse(spu,
 					&spu_full_list, full_list)
-				sysfs_remove_group(&spu->sysdev.kobj, attrs);
+				sysfs_remove_group(&spu->dev.kobj, attrs);
 			break;
 		}
 	}
@@ -564,45 +565,45 @@ int spu_add_sysdev_attr_group(struct attribute_group *attrs)
 
 	return rc;
 }
-EXPORT_SYMBOL_GPL(spu_add_sysdev_attr_group);
+EXPORT_SYMBOL_GPL(spu_add_dev_attr_group);
 
 
-void spu_remove_sysdev_attr(struct sysdev_attribute *attr)
+void spu_remove_dev_attr(struct device_attribute *attr)
 {
 	struct spu *spu;
 
 	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
-		sysdev_remove_file(&spu->sysdev, attr);
+		device_remove_file(&spu->dev, attr);
 	mutex_unlock(&spu_full_list_mutex);
 }
-EXPORT_SYMBOL_GPL(spu_remove_sysdev_attr);
+EXPORT_SYMBOL_GPL(spu_remove_dev_attr);
 
-void spu_remove_sysdev_attr_group(struct attribute_group *attrs)
+void spu_remove_dev_attr_group(struct attribute_group *attrs)
 {
 	struct spu *spu;
 
 	mutex_lock(&spu_full_list_mutex);
 	list_for_each_entry(spu, &spu_full_list, full_list)
-		sysfs_remove_group(&spu->sysdev.kobj, attrs);
+		sysfs_remove_group(&spu->dev.kobj, attrs);
 	mutex_unlock(&spu_full_list_mutex);
 }
-EXPORT_SYMBOL_GPL(spu_remove_sysdev_attr_group);
+EXPORT_SYMBOL_GPL(spu_remove_dev_attr_group);
 
-static int spu_create_sysdev(struct spu *spu)
+static int spu_create_dev(struct spu *spu)
 {
 	int ret;
 
-	spu->sysdev.id = spu->number;
-	spu->sysdev.cls = &spu_sysdev_class;
-	ret = sysdev_register(&spu->sysdev);
+	spu->dev.id = spu->number;
+	spu->dev.bus = &spu_subsys;
+	ret = device_register(&spu->dev);
 	if (ret) {
 		printk(KERN_ERR "Can't register SPU %d with sysfs\n",
 				spu->number);
 		return ret;
 	}
 
-	sysfs_add_device_to_node(&spu->sysdev, spu->node);
+	sysfs_add_device_to_node(&spu->dev, spu->node);
 
 	return 0;
 }
@@ -638,7 +639,7 @@ static int __init create_spu(void *data)
 	if (ret)
 		goto out_destroy;
 
-	ret = spu_create_sysdev(spu);
+	ret = spu_create_dev(spu);
 	if (ret)
 		goto out_free_irqs;
 
@@ -695,10 +696,10 @@ static unsigned long long spu_acct_time(struct spu *spu,
 }
 
 
-static ssize_t spu_stat_show(struct sys_device *sysdev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t spu_stat_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
-	struct spu *spu = container_of(sysdev, struct spu, sysdev);
+	struct spu *spu = container_of(dev, struct spu, dev);
 
 	return sprintf(buf, "%s %llu %llu %llu %llu "
 		      "%llu %llu %llu %llu %llu %llu %llu %llu\n",
@@ -717,7 +718,7 @@ static ssize_t spu_stat_show(struct sys_device *sysdev,
 		spu->stats.libassist);
 }
 
-static SYSDEV_ATTR(stat, 0644, spu_stat_show, NULL);
+static DEVICE_ATTR(stat, 0644, spu_stat_show, NULL);
 
 #ifdef CONFIG_KEXEC
 
@@ -816,8 +817,8 @@ static int __init init_spu_base(void)
 	if (!spu_management_ops)
 		goto out;
 
-	/* create sysdev class for spus */
-	ret = sysdev_class_register(&spu_sysdev_class);
+	/* create system subsystem for spus */
+	ret = subsys_system_register(&spu_subsys, NULL);
 	if (ret)
 		goto out;
 
@@ -826,7 +827,7 @@ static int __init init_spu_base(void)
 	if (ret < 0) {
 		printk(KERN_WARNING "%s: Error initializing spus\n",
 			__func__);
-		goto out_unregister_sysdev_class;
+		goto out_unregister_subsys;
 	}
 
 	if (ret > 0)
@@ -836,15 +837,15 @@ static int __init init_spu_base(void)
 	xmon_register_spus(&spu_full_list);
 	crash_register_spus(&spu_full_list);
 	mutex_unlock(&spu_full_list_mutex);
-	spu_add_sysdev_attr(&attr_stat);
+	spu_add_dev_attr(&dev_attr_stat);
 	register_syscore_ops(&spu_syscore_ops);
 
 	spu_init_affinity();
 
 	return 0;
 
- out_unregister_sysdev_class:
-	sysdev_class_unregister(&spu_sysdev_class);
+ out_unregister_subsys:
+	bus_unregister(&spu_subsys);
  out:
 	return ret;
 }
diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c
index c8b3c69fe891..af281dce510a 100644
--- a/arch/powerpc/platforms/pseries/pseries_energy.c
+++ b/arch/powerpc/platforms/pseries/pseries_energy.c
@@ -15,7 +15,7 @@
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/seq_file.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/of.h>
 #include <asm/cputhreads.h>
@@ -184,7 +184,7 @@ static ssize_t get_best_energy_list(char *page, int activate)
 	return s-page;
 }
 
-static ssize_t get_best_energy_data(struct sys_device *dev,
+static ssize_t get_best_energy_data(struct device *dev,
 					char *page, int activate)
 {
 	int rc;
@@ -207,26 +207,26 @@ static ssize_t get_best_energy_data(struct sys_device *dev,
 
 /* Wrapper functions */
 
-static ssize_t cpu_activate_hint_list_show(struct sysdev_class *class,
-			struct sysdev_class_attribute *attr, char *page)
+static ssize_t cpu_activate_hint_list_show(struct device *dev,
+			struct device_attribute *attr, char *page)
 {
 	return get_best_energy_list(page, 1);
 }
 
-static ssize_t cpu_deactivate_hint_list_show(struct sysdev_class *class,
-			struct sysdev_class_attribute *attr, char *page)
+static ssize_t cpu_deactivate_hint_list_show(struct device *dev,
+			struct device_attribute *attr, char *page)
 {
 	return get_best_energy_list(page, 0);
 }
 
-static ssize_t percpu_activate_hint_show(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *page)
+static ssize_t percpu_activate_hint_show(struct device *dev,
+			struct device_attribute *attr, char *page)
 {
 	return get_best_energy_data(dev, page, 1);
 }
 
-static ssize_t percpu_deactivate_hint_show(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *page)
+static ssize_t percpu_deactivate_hint_show(struct device *dev,
+			struct device_attribute *attr, char *page)
 {
 	return get_best_energy_data(dev, page, 0);
 }
@@ -241,48 +241,48 @@ static ssize_t percpu_deactivate_hint_show(struct sys_device *dev,
  *	Per-cpu value of the hint
  */
 
-struct sysdev_class_attribute attr_cpu_activate_hint_list =
-		_SYSDEV_CLASS_ATTR(pseries_activate_hint_list, 0444,
+struct device_attribute attr_cpu_activate_hint_list =
+		__ATTR(pseries_activate_hint_list, 0444,
 		cpu_activate_hint_list_show, NULL);
 
-struct sysdev_class_attribute attr_cpu_deactivate_hint_list =
-		_SYSDEV_CLASS_ATTR(pseries_deactivate_hint_list, 0444,
+struct device_attribute attr_cpu_deactivate_hint_list =
+		__ATTR(pseries_deactivate_hint_list, 0444,
 		cpu_deactivate_hint_list_show, NULL);
 
-struct sysdev_attribute attr_percpu_activate_hint =
-		_SYSDEV_ATTR(pseries_activate_hint, 0444,
+struct device_attribute attr_percpu_activate_hint =
+		__ATTR(pseries_activate_hint, 0444,
 		percpu_activate_hint_show, NULL);
 
-struct sysdev_attribute attr_percpu_deactivate_hint =
-		_SYSDEV_ATTR(pseries_deactivate_hint, 0444,
+struct device_attribute attr_percpu_deactivate_hint =
+		__ATTR(pseries_deactivate_hint, 0444,
 		percpu_deactivate_hint_show, NULL);
 
 static int __init pseries_energy_init(void)
 {
 	int cpu, err;
-	struct sys_device *cpu_sys_dev;
+	struct device *cpu_dev;
 
 	if (!check_for_h_best_energy()) {
 		printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n");
 		return 0;
 	}
 	/* Create the sysfs files */
-	err = sysfs_create_file(&cpu_sysdev_class.kset.kobj,
-				&attr_cpu_activate_hint_list.attr);
+	err = device_create_file(cpu_subsys.dev_root,
+				&attr_cpu_activate_hint_list);
 	if (!err)
-		err = sysfs_create_file(&cpu_sysdev_class.kset.kobj,
-				&attr_cpu_deactivate_hint_list.attr);
+		err = device_create_file(cpu_subsys.dev_root,
+				&attr_cpu_deactivate_hint_list);
 
 	if (err)
 		return err;
 	for_each_possible_cpu(cpu) {
-		cpu_sys_dev = get_cpu_sysdev(cpu);
-		err = sysfs_create_file(&cpu_sys_dev->kobj,
-				&attr_percpu_activate_hint.attr);
+		cpu_dev = get_cpu_device(cpu);
+		err = device_create_file(cpu_dev,
+				&attr_percpu_activate_hint);
 		if (err)
 			break;
-		err = sysfs_create_file(&cpu_sys_dev->kobj,
-				&attr_percpu_deactivate_hint.attr);
+		err = device_create_file(cpu_dev,
+				&attr_percpu_deactivate_hint);
 		if (err)
 			break;
 	}
@@ -298,23 +298,20 @@ static int __init pseries_energy_init(void)
 static void __exit pseries_energy_cleanup(void)
 {
 	int cpu;
-	struct sys_device *cpu_sys_dev;
+	struct device *cpu_dev;
 
 	if (!sysfs_entries)
 		return;
 
 	/* Remove the sysfs files */
-	sysfs_remove_file(&cpu_sysdev_class.kset.kobj,
-				&attr_cpu_activate_hint_list.attr);
-
-	sysfs_remove_file(&cpu_sysdev_class.kset.kobj,
-				&attr_cpu_deactivate_hint_list.attr);
+	device_remove_file(cpu_subsys.dev_root, &attr_cpu_activate_hint_list);
+	device_remove_file(cpu_subsys.dev_root, &attr_cpu_deactivate_hint_list);
 
 	for_each_possible_cpu(cpu) {
-		cpu_sys_dev = get_cpu_sysdev(cpu);
-		sysfs_remove_file(&cpu_sys_dev->kobj,
+		cpu_dev = get_cpu_device(cpu);
+		sysfs_remove_file(&cpu_dev->kobj,
 				&attr_percpu_activate_hint.attr);
-		sysfs_remove_file(&cpu_sys_dev->kobj,
+		sysfs_remove_file(&cpu_dev->kobj,
 				&attr_percpu_deactivate_hint.attr);
 	}
 }
diff --git a/arch/powerpc/sysdev/ppc4xx_cpm.c b/arch/powerpc/sysdev/ppc4xx_cpm.c
index 73b86cc5ea74..82e2cfe35c62 100644
--- a/arch/powerpc/sysdev/ppc4xx_cpm.c
+++ b/arch/powerpc/sysdev/ppc4xx_cpm.c
@@ -179,12 +179,12 @@ static struct kobj_attribute cpm_idle_attr =
 
 static void cpm_idle_config_sysfs(void)
 {
-	struct sys_device *sys_dev;
+	struct device *dev;
 	unsigned long ret;
 
-	sys_dev = get_cpu_sysdev(0);
+	dev = get_cpu_device(0);
 
-	ret = sysfs_create_file(&sys_dev->kobj,
+	ret = sysfs_create_file(&dev->kobj,
 				&cpm_idle_attr.attr);
 	if (ret)
 		printk(KERN_WARNING
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 3ea872890da2..66cca03c0282 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -831,8 +831,8 @@ int setup_profiling_timer(unsigned int multiplier)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static ssize_t cpu_configure_show(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t cpu_configure_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	ssize_t count;
 
@@ -842,8 +842,8 @@ static ssize_t cpu_configure_show(struct sys_device *dev,
 	return count;
 }
 
-static ssize_t cpu_configure_store(struct sys_device *dev,
-				  struct sysdev_attribute *attr,
+static ssize_t cpu_configure_store(struct device *dev,
+				  struct device_attribute *attr,
 				  const char *buf, size_t count)
 {
 	int cpu = dev->id;
@@ -889,11 +889,11 @@ out:
 	put_online_cpus();
 	return rc ? rc : count;
 }
-static SYSDEV_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store);
+static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store);
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static ssize_t cpu_polarization_show(struct sys_device *dev,
-				     struct sysdev_attribute *attr, char *buf)
+static ssize_t cpu_polarization_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
 {
 	int cpu = dev->id;
 	ssize_t count;
@@ -919,22 +919,22 @@ static ssize_t cpu_polarization_show(struct sys_device *dev,
 	mutex_unlock(&smp_cpu_state_mutex);
 	return count;
 }
-static SYSDEV_ATTR(polarization, 0444, cpu_polarization_show, NULL);
+static DEVICE_ATTR(polarization, 0444, cpu_polarization_show, NULL);
 
-static ssize_t show_cpu_address(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_cpu_address(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%d\n", __cpu_logical_map[dev->id]);
 }
-static SYSDEV_ATTR(address, 0444, show_cpu_address, NULL);
+static DEVICE_ATTR(address, 0444, show_cpu_address, NULL);
 
 
 static struct attribute *cpu_common_attrs[] = {
 #ifdef CONFIG_HOTPLUG_CPU
-	&attr_configure.attr,
+	&dev_attr_configure.attr,
 #endif
-	&attr_address.attr,
-	&attr_polarization.attr,
+	&dev_attr_address.attr,
+	&dev_attr_polarization.attr,
 	NULL,
 };
 
@@ -942,8 +942,8 @@ static struct attribute_group cpu_common_attr_group = {
 	.attrs = cpu_common_attrs,
 };
 
-static ssize_t show_capability(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_capability(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	unsigned int capability;
 	int rc;
@@ -953,10 +953,10 @@ static ssize_t show_capability(struct sys_device *dev,
 		return rc;
 	return sprintf(buf, "%u\n", capability);
 }
-static SYSDEV_ATTR(capability, 0444, show_capability, NULL);
+static DEVICE_ATTR(capability, 0444, show_capability, NULL);
 
-static ssize_t show_idle_count(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_idle_count(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	struct s390_idle_data *idle;
 	unsigned long long idle_count;
@@ -976,10 +976,10 @@ repeat:
 		goto repeat;
 	return sprintf(buf, "%llu\n", idle_count);
 }
-static SYSDEV_ATTR(idle_count, 0444, show_idle_count, NULL);
+static DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL);
 
-static ssize_t show_idle_time(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_idle_time(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	struct s390_idle_data *idle;
 	unsigned long long now, idle_time, idle_enter;
@@ -1001,12 +1001,12 @@ repeat:
 		goto repeat;
 	return sprintf(buf, "%llu\n", idle_time >> 12);
 }
-static SYSDEV_ATTR(idle_time_us, 0444, show_idle_time, NULL);
+static DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL);
 
 static struct attribute *cpu_online_attrs[] = {
-	&attr_capability.attr,
-	&attr_idle_count.attr,
-	&attr_idle_time_us.attr,
+	&dev_attr_capability.attr,
+	&dev_attr_idle_count.attr,
+	&dev_attr_idle_time_us.attr,
 	NULL,
 };
 
@@ -1019,7 +1019,7 @@ static int __cpuinit smp_cpu_notify(struct notifier_block *self,
 {
 	unsigned int cpu = (unsigned int)(long)hcpu;
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
-	struct sys_device *s = &c->sysdev;
+	struct device *s = &c->dev;
 	struct s390_idle_data *idle;
 	int err = 0;
 
@@ -1045,7 +1045,7 @@ static struct notifier_block __cpuinitdata smp_cpu_nb = {
 static int __devinit smp_add_present_cpu(int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
-	struct sys_device *s = &c->sysdev;
+	struct device *s = &c->dev;
 	int rc;
 
 	c->hotpluggable = 1;
@@ -1098,8 +1098,8 @@ out:
 	return rc;
 }
 
-static ssize_t __ref rescan_store(struct sysdev_class *class,
-				  struct sysdev_class_attribute *attr,
+static ssize_t __ref rescan_store(struct device *dev,
+				  struct device_attribute *attr,
 				  const char *buf,
 				  size_t count)
 {
@@ -1108,11 +1108,11 @@ static ssize_t __ref rescan_store(struct sysdev_class *class,
 	rc = smp_rescan_cpus();
 	return rc ? rc : count;
 }
-static SYSDEV_CLASS_ATTR(rescan, 0200, NULL, rescan_store);
+static DEVICE_ATTR(rescan, 0200, NULL, rescan_store);
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static ssize_t dispatching_show(struct sysdev_class *class,
-				struct sysdev_class_attribute *attr,
+static ssize_t dispatching_show(struct device *dev,
+				struct device_attribute *attr,
 				char *buf)
 {
 	ssize_t count;
@@ -1123,8 +1123,8 @@ static ssize_t dispatching_show(struct sysdev_class *class,
 	return count;
 }
 
-static ssize_t dispatching_store(struct sysdev_class *dev,
-				 struct sysdev_class_attribute *attr,
+static ssize_t dispatching_store(struct device *dev,
+				 struct device_attribute *attr,
 				 const char *buf,
 				 size_t count)
 {
@@ -1148,7 +1148,7 @@ out:
 	put_online_cpus();
 	return rc ? rc : count;
 }
-static SYSDEV_CLASS_ATTR(dispatching, 0644, dispatching_show,
+static DEVICE_ATTR(dispatching, 0644, dispatching_show,
 			 dispatching_store);
 
 static int __init topology_init(void)
@@ -1159,11 +1159,11 @@ static int __init topology_init(void)
 	register_cpu_notifier(&smp_cpu_nb);
 
 #ifdef CONFIG_HOTPLUG_CPU
-	rc = sysdev_class_create_file(&cpu_sysdev_class, &attr_rescan);
+	rc = device_create_file(cpu_subsys.dev_root, &dev_attr_rescan);
 	if (rc)
 		return rc;
 #endif
-	rc = sysdev_class_create_file(&cpu_sysdev_class, &attr_dispatching);
+	rc = device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching);
 	if (rc)
 		return rc;
 	for_each_present_cpu(cpu) {
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 77b8942b9a15..6dfc524c31aa 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -230,7 +230,7 @@ void store_topology(struct sysinfo_15_1_x *info)
 int arch_update_cpu_topology(void)
 {
 	struct sysinfo_15_1_x *info = tl_info;
-	struct sys_device *sysdev;
+	struct device *dev;
 	int cpu;
 
 	if (!MACHINE_HAS_TOPOLOGY) {
@@ -242,8 +242,8 @@ int arch_update_cpu_topology(void)
 	tl_to_cores(info);
 	update_cpu_core_map();
 	for_each_online_cpu(cpu) {
-		sysdev = get_cpu_sysdev(cpu);
-		kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
+		dev = get_cpu_device(cpu);
+		kobject_uevent(&dev->kobj, KOBJ_CHANGE);
 	}
 	return 1;
 }
diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c
index f0907995b4c9..a8140f0bbf6c 100644
--- a/arch/sh/kernel/cpu/sh4/sq.c
+++ b/arch/sh/kernel/cpu/sh4/sq.c
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/cpu.h>
 #include <linux/bitmap.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -337,9 +337,9 @@ static struct kobj_type ktype_percpu_entry = {
 	.default_attrs	= sq_sysfs_attrs,
 };
 
-static int __devinit sq_sysdev_add(struct sys_device *sysdev)
+static int __devinit sq_dev_add(struct device *dev)
 {
-	unsigned int cpu = sysdev->id;
+	unsigned int cpu = dev->id;
 	struct kobject *kobj;
 	int error;
 
@@ -348,25 +348,27 @@ static int __devinit sq_sysdev_add(struct sys_device *sysdev)
 		return -ENOMEM;
 
 	kobj = sq_kobject[cpu];
-	error = kobject_init_and_add(kobj, &ktype_percpu_entry, &sysdev->kobj,
+	error = kobject_init_and_add(kobj, &ktype_percpu_entry, &dev->kobj,
 				     "%s", "sq");
 	if (!error)
 		kobject_uevent(kobj, KOBJ_ADD);
 	return error;
 }
 
-static int __devexit sq_sysdev_remove(struct sys_device *sysdev)
+static int __devexit sq_dev_remove(struct device *dev)
 {
-	unsigned int cpu = sysdev->id;
+	unsigned int cpu = dev->id;
 	struct kobject *kobj = sq_kobject[cpu];
 
 	kobject_put(kobj);
 	return 0;
 }
 
-static struct sysdev_driver sq_sysdev_driver = {
-	.add		= sq_sysdev_add,
-	.remove		= __devexit_p(sq_sysdev_remove),
+static struct subsys_interface sq_interface = {
+	.name		= "sq"
+	.subsys		= &cpu_subsys,
+	.add_dev	= sq_dev_add,
+	.remove_dev	= __devexit_p(sq_dev_remove),
 };
 
 static int __init sq_api_init(void)
@@ -386,7 +388,7 @@ static int __init sq_api_init(void)
 	if (unlikely(!sq_bitmap))
 		goto out;
 
-	ret = sysdev_driver_register(&cpu_sysdev_class, &sq_sysdev_driver);
+	ret = subsys_interface_register(&sq_interface);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -401,7 +403,7 @@ out:
 
 static void __exit sq_api_exit(void)
 {
-	sysdev_driver_unregister(&cpu_sysdev_class, &sq_sysdev_driver);
+	subsys_interface_unregister(&sq_interface);
 	kfree(sq_bitmap);
 	kmem_cache_destroy(sq_cache);
 }
diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c
index 7408201d7efb..654e8aad3bbe 100644
--- a/arch/sparc/kernel/sysfs.c
+++ b/arch/sparc/kernel/sysfs.c
@@ -3,7 +3,7 @@
  * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
  */
 #include <linux/sched.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/percpu.h>
@@ -16,13 +16,13 @@
 static DEFINE_PER_CPU(struct hv_mmu_statistics, mmu_stats) __attribute__((aligned(64)));
 
 #define SHOW_MMUSTAT_ULONG(NAME) \
-static ssize_t show_##NAME(struct sys_device *dev, \
-			struct sysdev_attribute *attr, char *buf) \
+static ssize_t show_##NAME(struct device *dev, \
+			struct device_attribute *attr, char *buf) \
 { \
 	struct hv_mmu_statistics *p = &per_cpu(mmu_stats, dev->id); \
 	return sprintf(buf, "%lu\n", p->NAME); \
 } \
-static SYSDEV_ATTR(NAME, 0444, show_##NAME, NULL)
+static DEVICE_ATTR(NAME, 0444, show_##NAME, NULL)
 
 SHOW_MMUSTAT_ULONG(immu_tsb_hits_ctx0_8k_tte);
 SHOW_MMUSTAT_ULONG(immu_tsb_ticks_ctx0_8k_tte);
@@ -58,38 +58,38 @@ SHOW_MMUSTAT_ULONG(dmmu_tsb_hits_ctxnon0_256mb_tte);
 SHOW_MMUSTAT_ULONG(dmmu_tsb_ticks_ctxnon0_256mb_tte);
 
 static struct attribute *mmu_stat_attrs[] = {
-	&attr_immu_tsb_hits_ctx0_8k_tte.attr,
-	&attr_immu_tsb_ticks_ctx0_8k_tte.attr,
-	&attr_immu_tsb_hits_ctx0_64k_tte.attr,
-	&attr_immu_tsb_ticks_ctx0_64k_tte.attr,
-	&attr_immu_tsb_hits_ctx0_4mb_tte.attr,
-	&attr_immu_tsb_ticks_ctx0_4mb_tte.attr,
-	&attr_immu_tsb_hits_ctx0_256mb_tte.attr,
-	&attr_immu_tsb_ticks_ctx0_256mb_tte.attr,
-	&attr_immu_tsb_hits_ctxnon0_8k_tte.attr,
-	&attr_immu_tsb_ticks_ctxnon0_8k_tte.attr,
-	&attr_immu_tsb_hits_ctxnon0_64k_tte.attr,
-	&attr_immu_tsb_ticks_ctxnon0_64k_tte.attr,
-	&attr_immu_tsb_hits_ctxnon0_4mb_tte.attr,
-	&attr_immu_tsb_ticks_ctxnon0_4mb_tte.attr,
-	&attr_immu_tsb_hits_ctxnon0_256mb_tte.attr,
-	&attr_immu_tsb_ticks_ctxnon0_256mb_tte.attr,
-	&attr_dmmu_tsb_hits_ctx0_8k_tte.attr,
-	&attr_dmmu_tsb_ticks_ctx0_8k_tte.attr,
-	&attr_dmmu_tsb_hits_ctx0_64k_tte.attr,
-	&attr_dmmu_tsb_ticks_ctx0_64k_tte.attr,
-	&attr_dmmu_tsb_hits_ctx0_4mb_tte.attr,
-	&attr_dmmu_tsb_ticks_ctx0_4mb_tte.attr,
-	&attr_dmmu_tsb_hits_ctx0_256mb_tte.attr,
-	&attr_dmmu_tsb_ticks_ctx0_256mb_tte.attr,
-	&attr_dmmu_tsb_hits_ctxnon0_8k_tte.attr,
-	&attr_dmmu_tsb_ticks_ctxnon0_8k_tte.attr,
-	&attr_dmmu_tsb_hits_ctxnon0_64k_tte.attr,
-	&attr_dmmu_tsb_ticks_ctxnon0_64k_tte.attr,
-	&attr_dmmu_tsb_hits_ctxnon0_4mb_tte.attr,
-	&attr_dmmu_tsb_ticks_ctxnon0_4mb_tte.attr,
-	&attr_dmmu_tsb_hits_ctxnon0_256mb_tte.attr,
-	&attr_dmmu_tsb_ticks_ctxnon0_256mb_tte.attr,
+	&dev_attr_immu_tsb_hits_ctx0_8k_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctx0_8k_tte.attr,
+	&dev_attr_immu_tsb_hits_ctx0_64k_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctx0_64k_tte.attr,
+	&dev_attr_immu_tsb_hits_ctx0_4mb_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctx0_4mb_tte.attr,
+	&dev_attr_immu_tsb_hits_ctx0_256mb_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctx0_256mb_tte.attr,
+	&dev_attr_immu_tsb_hits_ctxnon0_8k_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctxnon0_8k_tte.attr,
+	&dev_attr_immu_tsb_hits_ctxnon0_64k_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctxnon0_64k_tte.attr,
+	&dev_attr_immu_tsb_hits_ctxnon0_4mb_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctxnon0_4mb_tte.attr,
+	&dev_attr_immu_tsb_hits_ctxnon0_256mb_tte.attr,
+	&dev_attr_immu_tsb_ticks_ctxnon0_256mb_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctx0_8k_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctx0_8k_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctx0_64k_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctx0_64k_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctx0_4mb_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctx0_4mb_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctx0_256mb_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctx0_256mb_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctxnon0_8k_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctxnon0_8k_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctxnon0_64k_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctxnon0_64k_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctxnon0_4mb_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctxnon0_4mb_tte.attr,
+	&dev_attr_dmmu_tsb_hits_ctxnon0_256mb_tte.attr,
+	&dev_attr_dmmu_tsb_ticks_ctxnon0_256mb_tte.attr,
 	NULL,
 };
 
@@ -139,15 +139,15 @@ static unsigned long write_mmustat_enable(unsigned long val)
 	return sun4v_mmustat_conf(ra, &orig_ra);
 }
 
-static ssize_t show_mmustat_enable(struct sys_device *s,
-				struct sysdev_attribute *attr, char *buf)
+static ssize_t show_mmustat_enable(struct device *s,
+				struct device_attribute *attr, char *buf)
 {
 	unsigned long val = run_on_cpu(s->id, read_mmustat_enable, 0);
 	return sprintf(buf, "%lx\n", val);
 }
 
-static ssize_t store_mmustat_enable(struct sys_device *s,
-			struct sysdev_attribute *attr, const char *buf,
+static ssize_t store_mmustat_enable(struct device *s,
+			struct device_attribute *attr, const char *buf,
 			size_t count)
 {
 	unsigned long val, err;
@@ -163,39 +163,39 @@ static ssize_t store_mmustat_enable(struct sys_device *s,
 	return count;
 }
 
-static SYSDEV_ATTR(mmustat_enable, 0644, show_mmustat_enable, store_mmustat_enable);
+static DEVICE_ATTR(mmustat_enable, 0644, show_mmustat_enable, store_mmustat_enable);
 
 static int mmu_stats_supported;
 
-static int register_mmu_stats(struct sys_device *s)
+static int register_mmu_stats(struct device *s)
 {
 	if (!mmu_stats_supported)
 		return 0;
-	sysdev_create_file(s, &attr_mmustat_enable);
+	device_create_file(s, &dev_attr_mmustat_enable);
 	return sysfs_create_group(&s->kobj, &mmu_stat_group);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void unregister_mmu_stats(struct sys_device *s)
+static void unregister_mmu_stats(struct device *s)
 {
 	if (!mmu_stats_supported)
 		return;
 	sysfs_remove_group(&s->kobj, &mmu_stat_group);
-	sysdev_remove_file(s, &attr_mmustat_enable);
+	device_remove_file(s, &dev_attr_mmustat_enable);
 }
 #endif
 
 #define SHOW_CPUDATA_ULONG_NAME(NAME, MEMBER) \
-static ssize_t show_##NAME(struct sys_device *dev, \
-		struct sysdev_attribute *attr, char *buf) \
+static ssize_t show_##NAME(struct device *dev, \
+		struct device_attribute *attr, char *buf) \
 { \
 	cpuinfo_sparc *c = &cpu_data(dev->id); \
 	return sprintf(buf, "%lu\n", c->MEMBER); \
 }
 
 #define SHOW_CPUDATA_UINT_NAME(NAME, MEMBER) \
-static ssize_t show_##NAME(struct sys_device *dev, \
-		struct sysdev_attribute *attr, char *buf) \
+static ssize_t show_##NAME(struct device *dev, \
+		struct device_attribute *attr, char *buf) \
 { \
 	cpuinfo_sparc *c = &cpu_data(dev->id); \
 	return sprintf(buf, "%u\n", c->MEMBER); \
@@ -209,14 +209,14 @@ SHOW_CPUDATA_UINT_NAME(l1_icache_line_size, icache_line_size);
 SHOW_CPUDATA_UINT_NAME(l2_cache_size, ecache_size);
 SHOW_CPUDATA_UINT_NAME(l2_cache_line_size, ecache_line_size);
 
-static struct sysdev_attribute cpu_core_attrs[] = {
-	_SYSDEV_ATTR(clock_tick,          0444, show_clock_tick, NULL),
-	_SYSDEV_ATTR(l1_dcache_size,      0444, show_l1_dcache_size, NULL),
-	_SYSDEV_ATTR(l1_dcache_line_size, 0444, show_l1_dcache_line_size, NULL),
-	_SYSDEV_ATTR(l1_icache_size,      0444, show_l1_icache_size, NULL),
-	_SYSDEV_ATTR(l1_icache_line_size, 0444, show_l1_icache_line_size, NULL),
-	_SYSDEV_ATTR(l2_cache_size,       0444, show_l2_cache_size, NULL),
-	_SYSDEV_ATTR(l2_cache_line_size,  0444, show_l2_cache_line_size, NULL),
+static struct device_attribute cpu_core_attrs[] = {
+	__ATTR(clock_tick,          0444, show_clock_tick, NULL),
+	__ATTR(l1_dcache_size,      0444, show_l1_dcache_size, NULL),
+	__ATTR(l1_dcache_line_size, 0444, show_l1_dcache_line_size, NULL),
+	__ATTR(l1_icache_size,      0444, show_l1_icache_size, NULL),
+	__ATTR(l1_icache_line_size, 0444, show_l1_icache_line_size, NULL),
+	__ATTR(l2_cache_size,       0444, show_l2_cache_size, NULL),
+	__ATTR(l2_cache_line_size,  0444, show_l2_cache_line_size, NULL),
 };
 
 static DEFINE_PER_CPU(struct cpu, cpu_devices);
@@ -224,11 +224,11 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
 static void register_cpu_online(unsigned int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
-	struct sys_device *s = &c->sysdev;
+	struct device *s = &c->dev;
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(cpu_core_attrs); i++)
-		sysdev_create_file(s, &cpu_core_attrs[i]);
+		device_create_file(s, &cpu_core_attrs[i]);
 
 	register_mmu_stats(s);
 }
@@ -237,12 +237,12 @@ static void register_cpu_online(unsigned int cpu)
 static void unregister_cpu_online(unsigned int cpu)
 {
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
-	struct sys_device *s = &c->sysdev;
+	struct device *s = &c->dev;
 	int i;
 
 	unregister_mmu_stats(s);
 	for (i = 0; i < ARRAY_SIZE(cpu_core_attrs); i++)
-		sysdev_remove_file(s, &cpu_core_attrs[i]);
+		device_remove_file(s, &cpu_core_attrs[i]);
 }
 #endif
 
diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
index b671a86f4515..e7ce2a5161b8 100644
--- a/arch/tile/kernel/sysfs.c
+++ b/arch/tile/kernel/sysfs.c
@@ -14,7 +14,7 @@
  * /sys entry support.
  */
 
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
@@ -31,55 +31,55 @@ static ssize_t get_hv_confstr(char *page, int query)
 	return n;
 }
 
-static ssize_t chip_width_show(struct sysdev_class *dev,
-			       struct sysdev_class_attribute *attr,
+static ssize_t chip_width_show(struct device *dev,
+			       struct device_attribute *attr,
 			       char *page)
 {
 	return sprintf(page, "%u\n", smp_width);
 }
-static SYSDEV_CLASS_ATTR(chip_width, 0444, chip_width_show, NULL);
+static DEVICE_ATTR(chip_width, 0444, chip_width_show, NULL);
 
-static ssize_t chip_height_show(struct sysdev_class *dev,
-				struct sysdev_class_attribute *attr,
+static ssize_t chip_height_show(struct device *dev,
+				struct device_attribute *attr,
 				char *page)
 {
 	return sprintf(page, "%u\n", smp_height);
 }
-static SYSDEV_CLASS_ATTR(chip_height, 0444, chip_height_show, NULL);
+static DEVICE_ATTR(chip_height, 0444, chip_height_show, NULL);
 
-static ssize_t chip_serial_show(struct sysdev_class *dev,
-				struct sysdev_class_attribute *attr,
+static ssize_t chip_serial_show(struct device *dev,
+				struct device_attribute *attr,
 				char *page)
 {
 	return get_hv_confstr(page, HV_CONFSTR_CHIP_SERIAL_NUM);
 }
-static SYSDEV_CLASS_ATTR(chip_serial, 0444, chip_serial_show, NULL);
+static DEVICE_ATTR(chip_serial, 0444, chip_serial_show, NULL);
 
-static ssize_t chip_revision_show(struct sysdev_class *dev,
-				  struct sysdev_class_attribute *attr,
+static ssize_t chip_revision_show(struct device *dev,
+				  struct device_attribute *attr,
 				  char *page)
 {
 	return get_hv_confstr(page, HV_CONFSTR_CHIP_REV);
 }
-static SYSDEV_CLASS_ATTR(chip_revision, 0444, chip_revision_show, NULL);
+static DEVICE_ATTR(chip_revision, 0444, chip_revision_show, NULL);
 
 
-static ssize_t type_show(struct sysdev_class *dev,
-			    struct sysdev_class_attribute *attr,
+static ssize_t type_show(struct device *dev,
+			    struct device_attribute *attr,
 			    char *page)
 {
 	return sprintf(page, "tilera\n");
 }
-static SYSDEV_CLASS_ATTR(type, 0444, type_show, NULL);
+static DEVICE_ATTR(type, 0444, type_show, NULL);
 
 #define HV_CONF_ATTR(name, conf)					\
-	static ssize_t name ## _show(struct sysdev_class *dev,		\
-				     struct sysdev_class_attribute *attr, \
+	static ssize_t name ## _show(struct device *dev,		\
+				     struct device_attribute *attr, \
 				     char *page)			\
 	{								\
 		return get_hv_confstr(page, conf);			\
 	}								\
-	static SYSDEV_CLASS_ATTR(name, 0444, name ## _show, NULL);
+	static DEVICE_ATTR(name, 0444, name ## _show, NULL);
 
 HV_CONF_ATTR(version,		HV_CONFSTR_HV_SW_VER)
 HV_CONF_ATTR(config_version,	HV_CONFSTR_HV_CONFIG_VER)
@@ -95,15 +95,15 @@ HV_CONF_ATTR(mezz_description,	HV_CONFSTR_MEZZ_DESC)
 HV_CONF_ATTR(switch_control,	HV_CONFSTR_SWITCH_CONTROL)
 
 static struct attribute *board_attrs[] = {
-	&attr_board_part.attr,
-	&attr_board_serial.attr,
-	&attr_board_revision.attr,
-	&attr_board_description.attr,
-	&attr_mezz_part.attr,
-	&attr_mezz_serial.attr,
-	&attr_mezz_revision.attr,
-	&attr_mezz_description.attr,
-	&attr_switch_control.attr,
+	&dev_attr_board_part.attr,
+	&dev_attr_board_serial.attr,
+	&dev_attr_board_revision.attr,
+	&dev_attr_board_description.attr,
+	&dev_attr_mezz_part.attr,
+	&dev_attr_mezz_serial.attr,
+	&dev_attr_mezz_revision.attr,
+	&dev_attr_mezz_description.attr,
+	&dev_attr_switch_control.attr,
 	NULL
 };
 
@@ -150,12 +150,11 @@ hvconfig_bin_read(struct file *filp, struct kobject *kobj,
 
 static int __init create_sysfs_entries(void)
 {
-	struct sysdev_class *cls = &cpu_sysdev_class;
 	int err = 0;
 
 #define create_cpu_attr(name)						\
 	if (!err)							\
-		err = sysfs_create_file(&cls->kset.kobj, &attr_##name.attr);
+		err = device_create_file(cpu_subsys.dev_root, &dev_attr_##name);
 	create_cpu_attr(chip_width);
 	create_cpu_attr(chip_height);
 	create_cpu_attr(chip_serial);
@@ -163,7 +162,7 @@ static int __init create_sysfs_entries(void)
 
 #define create_hv_attr(name)						\
 	if (!err)							\
-		err = sysfs_create_file(hypervisor_kobj, &attr_##name.attr);
+		err = sysfs_create_file(hypervisor_kobj, &dev_attr_##name);
 	create_hv_attr(type);
 	create_hv_attr(version);
 	create_hv_attr(config_version);
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c9321f34e55b..0b05fb49c560 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -149,7 +149,7 @@ static inline void enable_p5_mce(void) {}
 
 void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct sys_device, mce_sysdev);
+DECLARE_PER_CPU(struct device, mce_device);
 
 /*
  * Maximum banks number.
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a3b0811693c9..6b45e5e7a901 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -844,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
 
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
-
-extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
+#include <linux/cpu.h>
 
 /* pointer to kobject for cpuX/cache */
 static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
@@ -1073,9 +1072,9 @@ err_out:
 static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
 
 /* Add/Remove cache interface for CPU device */
-static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
+static int __cpuinit cache_add_dev(struct device *dev)
 {
-	unsigned int cpu = sys_dev->id;
+	unsigned int cpu = dev->id;
 	unsigned long i, j;
 	struct _index_kobject *this_object;
 	struct _cpuid4_info   *this_leaf;
@@ -1087,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 
 	retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
 				      &ktype_percpu_entry,
-				      &sys_dev->kobj, "%s", "cache");
+				      &dev->kobj, "%s", "cache");
 	if (retval < 0) {
 		cpuid4_cache_sysfs_exit(cpu);
 		return retval;
@@ -1124,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 	return 0;
 }
 
-static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
+static void __cpuinit cache_remove_dev(struct device *dev)
 {
-	unsigned int cpu = sys_dev->id;
+	unsigned int cpu = dev->id;
 	unsigned long i;
 
 	if (per_cpu(ici_cpuid4_info, cpu) == NULL)
@@ -1145,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *dev;
 
-	sys_dev = get_cpu_sysdev(cpu);
+	dev = get_cpu_device(cpu);
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		cache_add_dev(sys_dev);
+		cache_add_dev(dev);
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		cache_remove_dev(sys_dev);
+		cache_remove_dev(dev);
 		break;
 	}
 	return NOTIFY_OK;
@@ -1174,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void)
 
 	for_each_online_cpu(i) {
 		int err;
-		struct sys_device *sys_dev = get_cpu_sysdev(i);
+		struct device *dev = get_cpu_device(i);
 
-		err = cache_add_dev(sys_dev);
+		err = cache_add_dev(dev);
 		if (err)
 			return err;
 	}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index fefcc69ee8b5..ed44c8a65858 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,4 +1,4 @@
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <asm/mce.h>
 
 enum severity_level {
@@ -17,7 +17,7 @@ enum severity_level {
 struct mce_bank {
 	u64			ctl;			/* subevents to enable */
 	unsigned char init;				/* initialise bank? */
-	struct sysdev_attribute attr;			/* sysdev attribute */
+	struct device_attribute attr;			/* device attribute */
 	char			attrname[ATTR_LEN];	/* attribute name */
 };
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 362056aefeb4..0156c6f85d7b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -19,7 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/percpu.h>
 #include <linux/string.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/syscore_ops.h>
 #include <linux/delay.h>
 #include <linux/ctype.h>
@@ -1751,7 +1751,7 @@ static struct syscore_ops mce_syscore_ops = {
 };
 
 /*
- * mce_sysdev: Sysfs support
+ * mce_device: Sysfs support
  */
 
 static void mce_cpu_restart(void *data)
@@ -1787,27 +1787,28 @@ static void mce_enable_ce(void *all)
 		__mcheck_cpu_init_timer();
 }
 
-static struct sysdev_class mce_sysdev_class = {
+static struct bus_type mce_subsys = {
 	.name		= "machinecheck",
+	.dev_name	= "machinecheck",
 };
 
-DEFINE_PER_CPU(struct sys_device, mce_sysdev);
+DEFINE_PER_CPU(struct device, mce_device);
 
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
-static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
+static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
 {
 	return container_of(attr, struct mce_bank, attr);
 }
 
-static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
 			 char *buf)
 {
 	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
 }
 
-static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
 			const char *buf, size_t size)
 {
 	u64 new;
@@ -1822,14 +1823,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
 }
 
 static ssize_t
-show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
 {
 	strcpy(buf, mce_helper);
 	strcat(buf, "\n");
 	return strlen(mce_helper) + 1;
 }
 
-static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
 				const char *buf, size_t siz)
 {
 	char *p;
@@ -1844,8 +1845,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 	return strlen(mce_helper) + !!p;
 }
 
-static ssize_t set_ignore_ce(struct sys_device *s,
-			     struct sysdev_attribute *attr,
+static ssize_t set_ignore_ce(struct device *s,
+			     struct device_attribute *attr,
 			     const char *buf, size_t size)
 {
 	u64 new;
@@ -1868,8 +1869,8 @@ static ssize_t set_ignore_ce(struct sys_device *s,
 	return size;
 }
 
-static ssize_t set_cmci_disabled(struct sys_device *s,
-				 struct sysdev_attribute *attr,
+static ssize_t set_cmci_disabled(struct device *s,
+				 struct device_attribute *attr,
 				 const char *buf, size_t size)
 {
 	u64 new;
@@ -1891,108 +1892,107 @@ static ssize_t set_cmci_disabled(struct sys_device *s,
 	return size;
 }
 
-static ssize_t store_int_with_restart(struct sys_device *s,
-				      struct sysdev_attribute *attr,
+static ssize_t store_int_with_restart(struct device *s,
+				      struct device_attribute *attr,
 				      const char *buf, size_t size)
 {
-	ssize_t ret = sysdev_store_int(s, attr, buf, size);
+	ssize_t ret = device_store_int(s, attr, buf, size);
 	mce_restart();
 	return ret;
 }
 
-static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
-static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
+static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
 
-static struct sysdev_ext_attribute attr_check_interval = {
-	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
-		     store_int_with_restart),
+static struct dev_ext_attribute dev_attr_check_interval = {
+	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
 	&check_interval
 };
 
-static struct sysdev_ext_attribute attr_ignore_ce = {
-	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+	__ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
 	&mce_ignore_ce
 };
 
-static struct sysdev_ext_attribute attr_cmci_disabled = {
-	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+	__ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
 	&mce_cmci_disabled
 };
 
-static struct sysdev_attribute *mce_sysdev_attrs[] = {
-	&attr_tolerant.attr,
-	&attr_check_interval.attr,
-	&attr_trigger,
-	&attr_monarch_timeout.attr,
-	&attr_dont_log_ce.attr,
-	&attr_ignore_ce.attr,
-	&attr_cmci_disabled.attr,
+static struct device_attribute *mce_device_attrs[] = {
+	&dev_attr_tolerant.attr,
+	&dev_attr_check_interval.attr,
+	&dev_attr_trigger,
+	&dev_attr_monarch_timeout.attr,
+	&dev_attr_dont_log_ce.attr,
+	&dev_attr_ignore_ce.attr,
+	&dev_attr_cmci_disabled.attr,
 	NULL
 };
 
-static cpumask_var_t mce_sysdev_initialized;
+static cpumask_var_t mce_device_initialized;
 
-/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_sysdev_create(unsigned int cpu)
+/* Per cpu device init. All of the cpus still share the same ctrl bank: */
+static __cpuinit int mce_device_create(unsigned int cpu)
 {
-	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+	struct device *dev = &per_cpu(mce_device, cpu);
 	int err;
 	int i, j;
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	memset(&sysdev->kobj, 0, sizeof(struct kobject));
-	sysdev->id  = cpu;
-	sysdev->cls = &mce_sysdev_class;
+	memset(&dev->kobj, 0, sizeof(struct kobject));
+	dev->id  = cpu;
+	dev->bus = &mce_subsys;
 
-	err = sysdev_register(sysdev);
+	err = device_register(dev);
 	if (err)
 		return err;
 
-	for (i = 0; mce_sysdev_attrs[i]; i++) {
-		err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
+	for (i = 0; mce_device_attrs[i]; i++) {
+		err = device_create_file(dev, mce_device_attrs[i]);
 		if (err)
 			goto error;
 	}
 	for (j = 0; j < banks; j++) {
-		err = sysdev_create_file(sysdev, &mce_banks[j].attr);
+		err = device_create_file(dev, &mce_banks[j].attr);
 		if (err)
 			goto error2;
 	}
-	cpumask_set_cpu(cpu, mce_sysdev_initialized);
+	cpumask_set_cpu(cpu, mce_device_initialized);
 
 	return 0;
 error2:
 	while (--j >= 0)
-		sysdev_remove_file(sysdev, &mce_banks[j].attr);
+		device_remove_file(dev, &mce_banks[j].attr);
 error:
 	while (--i >= 0)
-		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+		device_remove_file(dev, mce_device_attrs[i]);
 
-	sysdev_unregister(sysdev);
+	device_unregister(dev);
 
 	return err;
 }
 
-static __cpuinit void mce_sysdev_remove(unsigned int cpu)
+static __cpuinit void mce_device_remove(unsigned int cpu)
 {
-	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+	struct device *dev = &per_cpu(mce_device, cpu);
 	int i;
 
-	if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
+	if (!cpumask_test_cpu(cpu, mce_device_initialized))
 		return;
 
-	for (i = 0; mce_sysdev_attrs[i]; i++)
-		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+	for (i = 0; mce_device_attrs[i]; i++)
+		device_remove_file(dev, mce_device_attrs[i]);
 
 	for (i = 0; i < banks; i++)
-		sysdev_remove_file(sysdev, &mce_banks[i].attr);
+		device_remove_file(dev, &mce_banks[i].attr);
 
-	sysdev_unregister(sysdev);
-	cpumask_clear_cpu(cpu, mce_sysdev_initialized);
+	device_unregister(dev);
+	cpumask_clear_cpu(cpu, mce_device_initialized);
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
@@ -2042,7 +2042,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		mce_sysdev_create(cpu);
+		mce_device_create(cpu);
 		if (threshold_cpu_callback)
 			threshold_cpu_callback(action, cpu);
 		break;
@@ -2050,7 +2050,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DEAD_FROZEN:
 		if (threshold_cpu_callback)
 			threshold_cpu_callback(action, cpu);
-		mce_sysdev_remove(cpu);
+		mce_device_remove(cpu);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
@@ -2084,7 +2084,7 @@ static __init void mce_init_banks(void)
 
 	for (i = 0; i < banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
-		struct sysdev_attribute *a = &b->attr;
+		struct device_attribute *a = &b->attr;
 
 		sysfs_attr_init(&a->attr);
 		a->attr.name	= b->attrname;
@@ -2104,16 +2104,16 @@ static __init int mcheck_init_device(void)
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
+	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
 
 	mce_init_banks();
 
-	err = sysdev_class_register(&mce_sysdev_class);
+	err = subsys_system_register(&mce_subsys, NULL);
 	if (err)
 		return err;
 
 	for_each_online_cpu(i) {
-		err = mce_sysdev_create(i);
+		err = mce_device_create(i);
 		if (err)
 			return err;
 	}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f5474218cffe..56d2aa1acd55 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -17,7 +17,6 @@
 #include <linux/notifier.h>
 #include <linux/kobject.h>
 #include <linux/percpu.h>
-#include <linux/sysdev.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/sysfs.h>
@@ -548,7 +547,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (!b)
 			goto out;
 
-		err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
+		err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
@@ -571,7 +570,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		goto out;
 	}
 
-	b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
+	b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj);
 	if (!b->kobj)
 		goto out_free;
 
@@ -591,7 +590,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (i == cpu)
 			continue;
 
-		err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
+		err = sysfs_create_link(&per_cpu(mce_device, i).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
@@ -669,7 +668,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
-		sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 
 		return;
@@ -681,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 		if (i == cpu)
 			continue;
 
-		sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_device, i).kobj, name);
 		per_cpu(threshold_banks, i)[bank] = NULL;
 	}
 
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 787e06c84ea6..59e3f6ed265f 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -19,7 +19,6 @@
 #include <linux/kernel.h>
 #include <linux/percpu.h>
 #include <linux/export.h>
-#include <linux/sysdev.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/smp.h>
@@ -69,16 +68,16 @@ static atomic_t therm_throt_en	= ATOMIC_INIT(0);
 static u32 lvtthmr_init __read_mostly;
 
 #ifdef CONFIG_SYSFS
-#define define_therm_throt_sysdev_one_ro(_name)				\
-	static SYSDEV_ATTR(_name, 0444,					\
-			   therm_throt_sysdev_show_##_name,		\
+#define define_therm_throt_device_one_ro(_name)				\
+	static DEVICE_ATTR(_name, 0444,					\
+			   therm_throt_device_show_##_name,		\
 				   NULL)				\
 
-#define define_therm_throt_sysdev_show_func(event, name)		\
+#define define_therm_throt_device_show_func(event, name)		\
 									\
-static ssize_t therm_throt_sysdev_show_##event##_##name(		\
-			struct sys_device *dev,				\
-			struct sysdev_attribute *attr,			\
+static ssize_t therm_throt_device_show_##event##_##name(		\
+			struct device *dev,				\
+			struct device_attribute *attr,			\
 			char *buf)					\
 {									\
 	unsigned int cpu = dev->id;					\
@@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name(		\
 	return ret;							\
 }
 
-define_therm_throt_sysdev_show_func(core_throttle, count);
-define_therm_throt_sysdev_one_ro(core_throttle_count);
+define_therm_throt_device_show_func(core_throttle, count);
+define_therm_throt_device_one_ro(core_throttle_count);
 
-define_therm_throt_sysdev_show_func(core_power_limit, count);
-define_therm_throt_sysdev_one_ro(core_power_limit_count);
+define_therm_throt_device_show_func(core_power_limit, count);
+define_therm_throt_device_one_ro(core_power_limit_count);
 
-define_therm_throt_sysdev_show_func(package_throttle, count);
-define_therm_throt_sysdev_one_ro(package_throttle_count);
+define_therm_throt_device_show_func(package_throttle, count);
+define_therm_throt_device_one_ro(package_throttle_count);
 
-define_therm_throt_sysdev_show_func(package_power_limit, count);
-define_therm_throt_sysdev_one_ro(package_power_limit_count);
+define_therm_throt_device_show_func(package_power_limit, count);
+define_therm_throt_device_one_ro(package_power_limit_count);
 
 static struct attribute *thermal_throttle_attrs[] = {
-	&attr_core_throttle_count.attr,
+	&dev_attr_core_throttle_count.attr,
 	NULL
 };
 
@@ -223,36 +222,36 @@ static int thresh_event_valid(int event)
 
 #ifdef CONFIG_SYSFS
 /* Add/Remove thermal_throttle interface for CPU device: */
-static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
+static __cpuinit int thermal_throttle_add_dev(struct device *dev,
 				unsigned int cpu)
 {
 	int err;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-	err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
+	err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
 	if (err)
 		return err;
 
 	if (cpu_has(c, X86_FEATURE_PLN))
-		err = sysfs_add_file_to_group(&sys_dev->kobj,
-					      &attr_core_power_limit_count.attr,
+		err = sysfs_add_file_to_group(&dev->kobj,
+					      &dev_attr_core_power_limit_count.attr,
 					      thermal_attr_group.name);
 	if (cpu_has(c, X86_FEATURE_PTS)) {
-		err = sysfs_add_file_to_group(&sys_dev->kobj,
-					      &attr_package_throttle_count.attr,
+		err = sysfs_add_file_to_group(&dev->kobj,
+					      &dev_attr_package_throttle_count.attr,
 					      thermal_attr_group.name);
 		if (cpu_has(c, X86_FEATURE_PLN))
-			err = sysfs_add_file_to_group(&sys_dev->kobj,
-					&attr_package_power_limit_count.attr,
+			err = sysfs_add_file_to_group(&dev->kobj,
+					&dev_attr_package_power_limit_count.attr,
 					thermal_attr_group.name);
 	}
 
 	return err;
 }
 
-static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
+static __cpuinit void thermal_throttle_remove_dev(struct device *dev)
 {
-	sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
+	sysfs_remove_group(&dev->kobj, &thermal_attr_group);
 }
 
 /* Mutex protecting device creation against CPU hotplug: */
@@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
 			      void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *dev;
 	int err = 0;
 
-	sys_dev = get_cpu_sysdev(cpu);
+	dev = get_cpu_device(cpu);
 
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
 		mutex_lock(&therm_cpu_lock);
-		err = thermal_throttle_add_dev(sys_dev, cpu);
+		err = thermal_throttle_add_dev(dev, cpu);
 		mutex_unlock(&therm_cpu_lock);
 		WARN_ON(err);
 		break;
@@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		mutex_lock(&therm_cpu_lock);
-		thermal_throttle_remove_dev(sys_dev);
+		thermal_throttle_remove_dev(dev);
 		mutex_unlock(&therm_cpu_lock);
 		break;
 	}
@@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void)
 #endif
 	/* connect live CPUs to sysfs */
 	for_each_online_cpu(cpu) {
-		err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
+		err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
 		WARN_ON(err);
 	}
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index f2d2a664e797..cf88f2a16473 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu)
 	return err;
 }
 
-static ssize_t reload_store(struct sys_device *dev,
-			    struct sysdev_attribute *attr,
+static ssize_t reload_store(struct device *dev,
+			    struct device_attribute *attr,
 			    const char *buf, size_t size)
 {
 	unsigned long val;
@@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev,
 	return ret;
 }
 
-static ssize_t version_show(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t version_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
 
 	return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
 }
 
-static ssize_t pf_show(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t pf_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
 
 	return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
 }
 
-static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
-static SYSDEV_ATTR(version, 0400, version_show, NULL);
-static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+static DEVICE_ATTR(reload, 0200, NULL, reload_store);
+static DEVICE_ATTR(version, 0400, version_show, NULL);
+static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
 
 static struct attribute *mc_default_attrs[] = {
-	&attr_reload.attr,
-	&attr_version.attr,
-	&attr_processor_flags.attr,
+	&dev_attr_reload.attr,
+	&dev_attr_version.attr,
+	&dev_attr_processor_flags.attr,
 	NULL
 };
 
@@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu)
 	return ustate;
 }
 
-static int mc_sysdev_add(struct sys_device *sys_dev)
+static int mc_device_add(struct device *dev, struct subsys_interface *sif)
 {
-	int err, cpu = sys_dev->id;
+	int err, cpu = dev->id;
 
 	if (!cpu_online(cpu))
 		return 0;
 
 	pr_debug("CPU%d added\n", cpu);
 
-	err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+	err = sysfs_create_group(&dev->kobj, &mc_attr_group);
 	if (err)
 		return err;
 
 	if (microcode_init_cpu(cpu) == UCODE_ERROR) {
-		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+		sysfs_remove_group(&dev->kobj, &mc_attr_group);
 		return -EINVAL;
 	}
 
 	return err;
 }
 
-static int mc_sysdev_remove(struct sys_device *sys_dev)
+static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
 {
-	int cpu = sys_dev->id;
+	int cpu = dev->id;
 
 	if (!cpu_online(cpu))
 		return 0;
 
 	pr_debug("CPU%d removed\n", cpu);
 	microcode_fini_cpu(cpu);
-	sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+	sysfs_remove_group(&dev->kobj, &mc_attr_group);
 	return 0;
 }
 
-static struct sysdev_driver mc_sysdev_driver = {
-	.add			= mc_sysdev_add,
-	.remove			= mc_sysdev_remove,
+static struct subsys_interface mc_cpu_interface = {
+	.name			= "microcode",
+	.subsys			= &cpu_subsys,
+	.add_dev		= mc_device_add,
+	.remove_dev		= mc_device_remove,
 };
 
 /**
@@ -464,9 +466,9 @@ static __cpuinit int
 mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *dev;
 
-	sys_dev = get_cpu_sysdev(cpu);
+	dev = get_cpu_device(cpu);
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
@@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 		pr_debug("CPU%d added\n", cpu);
-		if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
+		if (sysfs_create_group(&dev->kobj, &mc_attr_group))
 			pr_err("Failed to create group for CPU%d\n", cpu);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		/* Suspend is in progress, only remove the interface */
-		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+		sysfs_remove_group(&dev->kobj, &mc_attr_group);
 		pr_debug("CPU%d removed\n", cpu);
 		break;
 
@@ -527,7 +529,7 @@ static int __init microcode_init(void)
 	get_online_cpus();
 	mutex_lock(&microcode_mutex);
 
-	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+	error = subsys_interface_register(&mc_cpu_interface);
 
 	mutex_unlock(&microcode_mutex);
 	put_online_cpus();
@@ -561,7 +563,7 @@ static void __exit microcode_exit(void)
 	get_online_cpus();
 	mutex_lock(&microcode_mutex);
 
-	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+	subsys_interface_unregister(&mc_cpu_interface);
 
 	mutex_unlock(&microcode_mutex);
 	put_online_cpus();
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index 9d7bc9f6b6cc..20a68ca386de 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -446,7 +446,7 @@ static int __cpuinit acpi_processor_add(struct acpi_device *device)
 {
 	struct acpi_processor *pr = NULL;
 	int result = 0;
-	struct sys_device *sysdev;
+	struct device *dev;
 
 	pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
 	if (!pr)
@@ -491,8 +491,8 @@ static int __cpuinit acpi_processor_add(struct acpi_device *device)
 
 	per_cpu(processors, pr->id) = pr;
 
-	sysdev = get_cpu_sysdev(pr->id);
-	if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev")) {
+	dev = get_cpu_device(pr->id);
+	if (sysfs_create_link(&device->dev.kobj, &dev->kobj, "sysdev")) {
 		result = -EFAULT;
 		goto err_free_cpumask;
 	}
diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c
index 870550d6a4bf..3b599abf2b40 100644
--- a/drivers/acpi/processor_thermal.c
+++ b/drivers/acpi/processor_thermal.c
@@ -30,7 +30,6 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/cpufreq.h>
-#include <linux/sysdev.h>
 
 #include <asm/uaccess.h>
 
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 251acea3d359..5bb0298fbcc0 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -1,8 +1,7 @@
 /*
- * drivers/base/cpu.c - basic CPU class support
+ * CPU subsystem support
  */
 
-#include <linux/sysdev.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sched.h>
@@ -14,40 +13,40 @@
 
 #include "base.h"
 
-static struct sysdev_class_attribute *cpu_sysdev_class_attrs[];
-
-struct sysdev_class cpu_sysdev_class = {
+struct bus_type cpu_subsys = {
 	.name = "cpu",
-	.attrs = cpu_sysdev_class_attrs,
+	.dev_name = "cpu",
 };
-EXPORT_SYMBOL(cpu_sysdev_class);
+EXPORT_SYMBOL_GPL(cpu_subsys);
 
-static DEFINE_PER_CPU(struct sys_device *, cpu_sys_devices);
+static DEFINE_PER_CPU(struct device *, cpu_sys_devices);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static ssize_t show_online(struct sys_device *dev, struct sysdev_attribute *attr,
+static ssize_t show_online(struct device *dev,
+			   struct device_attribute *attr,
 			   char *buf)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
 
-	return sprintf(buf, "%u\n", !!cpu_online(cpu->sysdev.id));
+	return sprintf(buf, "%u\n", !!cpu_online(cpu->dev.id));
 }
 
-static ssize_t __ref store_online(struct sys_device *dev, struct sysdev_attribute *attr,
-				 const char *buf, size_t count)
+static ssize_t __ref store_online(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
 	ssize_t ret;
 
 	cpu_hotplug_driver_lock();
 	switch (buf[0]) {
 	case '0':
-		ret = cpu_down(cpu->sysdev.id);
+		ret = cpu_down(cpu->dev.id);
 		if (!ret)
 			kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
 		break;
 	case '1':
-		ret = cpu_up(cpu->sysdev.id);
+		ret = cpu_up(cpu->dev.id);
 		if (!ret)
 			kobject_uevent(&dev->kobj, KOBJ_ONLINE);
 		break;
@@ -60,44 +59,44 @@ static ssize_t __ref store_online(struct sys_device *dev, struct sysdev_attribut
 		ret = count;
 	return ret;
 }
-static SYSDEV_ATTR(online, 0644, show_online, store_online);
+static DEVICE_ATTR(online, 0644, show_online, store_online);
 
 static void __cpuinit register_cpu_control(struct cpu *cpu)
 {
-	sysdev_create_file(&cpu->sysdev, &attr_online);
+	device_create_file(&cpu->dev, &dev_attr_online);
 }
 void unregister_cpu(struct cpu *cpu)
 {
-	int logical_cpu = cpu->sysdev.id;
+	int logical_cpu = cpu->dev.id;
 
 	unregister_cpu_under_node(logical_cpu, cpu_to_node(logical_cpu));
 
-	sysdev_remove_file(&cpu->sysdev, &attr_online);
+	device_remove_file(&cpu->dev, &dev_attr_online);
 
-	sysdev_unregister(&cpu->sysdev);
+	device_unregister(&cpu->dev);
 	per_cpu(cpu_sys_devices, logical_cpu) = NULL;
 	return;
 }
 
 #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
-static ssize_t cpu_probe_store(struct sysdev_class *class,
-			       struct sysdev_class_attribute *attr,
+static ssize_t cpu_probe_store(struct device *dev,
+			       struct device_attribute *attr,
 			       const char *buf,
 			       size_t count)
 {
 	return arch_cpu_probe(buf, count);
 }
 
-static ssize_t cpu_release_store(struct sysdev_class *class,
-				 struct sysdev_class_attribute *attr,
+static ssize_t cpu_release_store(struct device *dev,
+				 struct device_attribute *attr,
 				 const char *buf,
 				 size_t count)
 {
 	return arch_cpu_release(buf, count);
 }
 
-static SYSDEV_CLASS_ATTR(probe, S_IWUSR, NULL, cpu_probe_store);
-static SYSDEV_CLASS_ATTR(release, S_IWUSR, NULL, cpu_release_store);
+static DEVICE_ATTR(probe, S_IWUSR, NULL, cpu_probe_store);
+static DEVICE_ATTR(release, S_IWUSR, NULL, cpu_release_store);
 #endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
 
 #else /* ... !CONFIG_HOTPLUG_CPU */
@@ -109,15 +108,15 @@ static inline void register_cpu_control(struct cpu *cpu)
 #ifdef CONFIG_KEXEC
 #include <linux/kexec.h>
 
-static ssize_t show_crash_notes(struct sys_device *dev, struct sysdev_attribute *attr,
+static ssize_t show_crash_notes(struct device *dev, struct device_attribute *attr,
 				char *buf)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
 	ssize_t rc;
 	unsigned long long addr;
 	int cpunum;
 
-	cpunum = cpu->sysdev.id;
+	cpunum = cpu->dev.id;
 
 	/*
 	 * Might be reading other cpu's data based on which cpu read thread
@@ -129,7 +128,7 @@ static ssize_t show_crash_notes(struct sys_device *dev, struct sysdev_attribute
 	rc = sprintf(buf, "%Lx\n", addr);
 	return rc;
 }
-static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
+static DEVICE_ATTR(crash_notes, 0400, show_crash_notes, NULL);
 #endif
 
 /*
@@ -137,12 +136,12 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
  */
 
 struct cpu_attr {
-	struct sysdev_class_attribute attr;
+	struct device_attribute attr;
 	const struct cpumask *const * const map;
 };
 
-static ssize_t show_cpus_attr(struct sysdev_class *class,
-			      struct sysdev_class_attribute *attr,
+static ssize_t show_cpus_attr(struct device *dev,
+			      struct device_attribute *attr,
 			      char *buf)
 {
 	struct cpu_attr *ca = container_of(attr, struct cpu_attr, attr);
@@ -153,10 +152,10 @@ static ssize_t show_cpus_attr(struct sysdev_class *class,
 	return n;
 }
 
-#define _CPU_ATTR(name, map)						\
-	{ _SYSDEV_CLASS_ATTR(name, 0444, show_cpus_attr, NULL), map }
+#define _CPU_ATTR(name, map) \
+	{ __ATTR(name, 0444, show_cpus_attr, NULL), map }
 
-/* Keep in sync with cpu_sysdev_class_attrs */
+/* Keep in sync with cpu_subsys_attrs */
 static struct cpu_attr cpu_attrs[] = {
 	_CPU_ATTR(online, &cpu_online_mask),
 	_CPU_ATTR(possible, &cpu_possible_mask),
@@ -166,19 +165,19 @@ static struct cpu_attr cpu_attrs[] = {
 /*
  * Print values for NR_CPUS and offlined cpus
  */
-static ssize_t print_cpus_kernel_max(struct sysdev_class *class,
-				     struct sysdev_class_attribute *attr, char *buf)
+static ssize_t print_cpus_kernel_max(struct device *dev,
+				     struct device_attribute *attr, char *buf)
 {
 	int n = snprintf(buf, PAGE_SIZE-2, "%d\n", NR_CPUS - 1);
 	return n;
 }
-static SYSDEV_CLASS_ATTR(kernel_max, 0444, print_cpus_kernel_max, NULL);
+static DEVICE_ATTR(kernel_max, 0444, print_cpus_kernel_max, NULL);
 
 /* arch-optional setting to enable display of offline cpus >= nr_cpu_ids */
 unsigned int total_cpus;
 
-static ssize_t print_cpus_offline(struct sysdev_class *class,
-				  struct sysdev_class_attribute *attr, char *buf)
+static ssize_t print_cpus_offline(struct device *dev,
+				  struct device_attribute *attr, char *buf)
 {
 	int n = 0, len = PAGE_SIZE-2;
 	cpumask_var_t offline;
@@ -205,7 +204,7 @@ static ssize_t print_cpus_offline(struct sysdev_class *class,
 	n += snprintf(&buf[n], len - n, "\n");
 	return n;
 }
-static SYSDEV_CLASS_ATTR(offline, 0444, print_cpus_offline, NULL);
+static DEVICE_ATTR(offline, 0444, print_cpus_offline, NULL);
 
 /*
  * register_cpu - Setup a sysfs device for a CPU.
@@ -218,57 +217,66 @@ static SYSDEV_CLASS_ATTR(offline, 0444, print_cpus_offline, NULL);
 int __cpuinit register_cpu(struct cpu *cpu, int num)
 {
 	int error;
-	cpu->node_id = cpu_to_node(num);
-	cpu->sysdev.id = num;
-	cpu->sysdev.cls = &cpu_sysdev_class;
-
-	error = sysdev_register(&cpu->sysdev);
 
+	cpu->node_id = cpu_to_node(num);
+	cpu->dev.id = num;
+	cpu->dev.bus = &cpu_subsys;
+	error = device_register(&cpu->dev);
 	if (!error && cpu->hotpluggable)
 		register_cpu_control(cpu);
 	if (!error)
-		per_cpu(cpu_sys_devices, num) = &cpu->sysdev;
+		per_cpu(cpu_sys_devices, num) = &cpu->dev;
 	if (!error)
 		register_cpu_under_node(num, cpu_to_node(num));
 
 #ifdef CONFIG_KEXEC
 	if (!error)
-		error = sysdev_create_file(&cpu->sysdev, &attr_crash_notes);
+		error = device_create_file(&cpu->dev, &dev_attr_crash_notes);
 #endif
 	return error;
 }
 
-struct sys_device *get_cpu_sysdev(unsigned cpu)
+struct device *get_cpu_device(unsigned cpu)
 {
 	if (cpu < nr_cpu_ids && cpu_possible(cpu))
 		return per_cpu(cpu_sys_devices, cpu);
 	else
 		return NULL;
 }
-EXPORT_SYMBOL_GPL(get_cpu_sysdev);
+EXPORT_SYMBOL_GPL(get_cpu_device);
+
+static struct attribute *cpu_root_attrs[] = {
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+	&dev_attr_probe.attr,
+	&dev_attr_release.attr,
+#endif
+	&cpu_attrs[0].attr.attr,
+	&cpu_attrs[1].attr.attr,
+	&cpu_attrs[2].attr.attr,
+	&dev_attr_kernel_max.attr,
+	&dev_attr_offline.attr,
+	NULL
+};
+
+static struct attribute_group cpu_root_attr_group = {
+	.attrs = cpu_root_attrs,
+};
+
+static const struct attribute_group *cpu_root_attr_groups[] = {
+	&cpu_root_attr_group,
+	NULL,
+};
 
 int __init cpu_dev_init(void)
 {
 	int err;
 
-	err = sysdev_class_register(&cpu_sysdev_class);
+	err = subsys_system_register(&cpu_subsys, cpu_root_attr_groups);
+	if (err)
+		return err;
+
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	if (!err)
-		err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class);
+	err = sched_create_sysfs_power_savings_entries(cpu_subsys.dev_root);
 #endif
-
 	return err;
 }
-
-static struct sysdev_class_attribute *cpu_sysdev_class_attrs[] = {
-#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
-	&attr_probe,
-	&attr_release,
-#endif
-	&cpu_attrs[0].attr,
-	&cpu_attrs[1].attr,
-	&cpu_attrs[2].attr,
-	&attr_kernel_max,
-	&attr_offline,
-	NULL
-};
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 793f796c4da3..6ce1501c7de5 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -315,12 +315,12 @@ struct node node_devices[MAX_NUMNODES];
 int register_cpu_under_node(unsigned int cpu, unsigned int nid)
 {
 	int ret;
-	struct sys_device *obj;
+	struct device *obj;
 
 	if (!node_online(nid))
 		return 0;
 
-	obj = get_cpu_sysdev(cpu);
+	obj = get_cpu_device(cpu);
 	if (!obj)
 		return 0;
 
@@ -337,12 +337,12 @@ int register_cpu_under_node(unsigned int cpu, unsigned int nid)
 
 int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
 {
-	struct sys_device *obj;
+	struct device *obj;
 
 	if (!node_online(nid))
 		return 0;
 
-	obj = get_cpu_sysdev(cpu);
+	obj = get_cpu_device(cpu);
 	if (!obj)
 		return 0;
 
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index f6f37a05a0c3..ae989c57cd5e 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -23,7 +23,6 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
  */
-#include <linux/sysdev.h>
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/cpu.h>
@@ -32,14 +31,14 @@
 #include <linux/topology.h>
 
 #define define_one_ro_named(_name, _func)				\
-static SYSDEV_ATTR(_name, 0444, _func, NULL)
+	static DEVICE_ATTR(_name, 0444, _func, NULL)
 
 #define define_one_ro(_name)				\
-static SYSDEV_ATTR(_name, 0444, show_##_name, NULL)
+	static DEVICE_ATTR(_name, 0444, show_##_name, NULL)
 
 #define define_id_show_func(name)				\
-static ssize_t show_##name(struct sys_device *dev,		\
-		struct sysdev_attribute *attr, char *buf)	\
+static ssize_t show_##name(struct device *dev,			\
+		struct device_attribute *attr, char *buf)	\
 {								\
 	unsigned int cpu = dev->id;				\
 	return sprintf(buf, "%d\n", topology_##name(cpu));	\
@@ -65,16 +64,16 @@ static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
 
 #ifdef arch_provides_topology_pointers
 #define define_siblings_show_map(name)					\
-static ssize_t show_##name(struct sys_device *dev,			\
-			   struct sysdev_attribute *attr, char *buf)	\
+static ssize_t show_##name(struct device *dev,				\
+			   struct device_attribute *attr, char *buf)	\
 {									\
 	unsigned int cpu = dev->id;					\
 	return show_cpumap(0, topology_##name(cpu), buf);		\
 }
 
 #define define_siblings_show_list(name)					\
-static ssize_t show_##name##_list(struct sys_device *dev,		\
-				  struct sysdev_attribute *attr,	\
+static ssize_t show_##name##_list(struct device *dev,			\
+				  struct device_attribute *attr,	\
 				  char *buf)				\
 {									\
 	unsigned int cpu = dev->id;					\
@@ -83,15 +82,15 @@ static ssize_t show_##name##_list(struct sys_device *dev,		\
 
 #else
 #define define_siblings_show_map(name)					\
-static ssize_t show_##name(struct sys_device *dev,			\
-			   struct sysdev_attribute *attr, char *buf)	\
+static ssize_t show_##name(struct device *dev,				\
+			   struct device_attribute *attr, char *buf)	\
 {									\
 	return show_cpumap(0, topology_##name(dev->id), buf);		\
 }
 
 #define define_siblings_show_list(name)					\
-static ssize_t show_##name##_list(struct sys_device *dev,		\
-				  struct sysdev_attribute *attr,	\
+static ssize_t show_##name##_list(struct device *dev,			\
+				  struct device_attribute *attr,	\
 				  char *buf)				\
 {									\
 	return show_cpumap(1, topology_##name(dev->id), buf);		\
@@ -124,16 +123,16 @@ define_one_ro_named(book_siblings_list, show_book_cpumask_list);
 #endif
 
 static struct attribute *default_attrs[] = {
-	&attr_physical_package_id.attr,
-	&attr_core_id.attr,
-	&attr_thread_siblings.attr,
-	&attr_thread_siblings_list.attr,
-	&attr_core_siblings.attr,
-	&attr_core_siblings_list.attr,
+	&dev_attr_physical_package_id.attr,
+	&dev_attr_core_id.attr,
+	&dev_attr_thread_siblings.attr,
+	&dev_attr_thread_siblings_list.attr,
+	&dev_attr_core_siblings.attr,
+	&dev_attr_core_siblings_list.attr,
 #ifdef CONFIG_SCHED_BOOK
-	&attr_book_id.attr,
-	&attr_book_siblings.attr,
-	&attr_book_siblings_list.attr,
+	&dev_attr_book_id.attr,
+	&dev_attr_book_siblings.attr,
+	&dev_attr_book_siblings_list.attr,
 #endif
 	NULL
 };
@@ -146,16 +145,16 @@ static struct attribute_group topology_attr_group = {
 /* Add/Remove cpu_topology interface for CPU device */
 static int __cpuinit topology_add_dev(unsigned int cpu)
 {
-	struct sys_device *sys_dev = get_cpu_sysdev(cpu);
+	struct device *dev = get_cpu_device(cpu);
 
-	return sysfs_create_group(&sys_dev->kobj, &topology_attr_group);
+	return sysfs_create_group(&dev->kobj, &topology_attr_group);
 }
 
 static void __cpuinit topology_remove_dev(unsigned int cpu)
 {
-	struct sys_device *sys_dev = get_cpu_sysdev(cpu);
+	struct device *dev = get_cpu_device(cpu);
 
-	sysfs_remove_group(&sys_dev->kobj, &topology_attr_group);
+	sysfs_remove_group(&dev->kobj, &topology_attr_group);
 }
 
 static int __cpuinit topology_cpu_callback(struct notifier_block *nfb,
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 987a165ede26..8c2df3499da7 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -679,7 +679,7 @@ static struct kobj_type ktype_cpufreq = {
  */
 static int cpufreq_add_dev_policy(unsigned int cpu,
 				  struct cpufreq_policy *policy,
-				  struct sys_device *sys_dev)
+				  struct device *dev)
 {
 	int ret = 0;
 #ifdef CONFIG_SMP
@@ -728,7 +728,7 @@ static int cpufreq_add_dev_policy(unsigned int cpu,
 			spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
 			pr_debug("CPU already managed, adding link\n");
-			ret = sysfs_create_link(&sys_dev->kobj,
+			ret = sysfs_create_link(&dev->kobj,
 						&managed_policy->kobj,
 						"cpufreq");
 			if (ret)
@@ -761,7 +761,7 @@ static int cpufreq_add_dev_symlink(unsigned int cpu,
 
 	for_each_cpu(j, policy->cpus) {
 		struct cpufreq_policy *managed_policy;
-		struct sys_device *cpu_sys_dev;
+		struct device *cpu_dev;
 
 		if (j == cpu)
 			continue;
@@ -770,8 +770,8 @@ static int cpufreq_add_dev_symlink(unsigned int cpu,
 
 		pr_debug("CPU %u already managed, adding link\n", j);
 		managed_policy = cpufreq_cpu_get(cpu);
-		cpu_sys_dev = get_cpu_sysdev(j);
-		ret = sysfs_create_link(&cpu_sys_dev->kobj, &policy->kobj,
+		cpu_dev = get_cpu_device(j);
+		ret = sysfs_create_link(&cpu_dev->kobj, &policy->kobj,
 					"cpufreq");
 		if (ret) {
 			cpufreq_cpu_put(managed_policy);
@@ -783,7 +783,7 @@ static int cpufreq_add_dev_symlink(unsigned int cpu,
 
 static int cpufreq_add_dev_interface(unsigned int cpu,
 				     struct cpufreq_policy *policy,
-				     struct sys_device *sys_dev)
+				     struct device *dev)
 {
 	struct cpufreq_policy new_policy;
 	struct freq_attr **drv_attr;
@@ -793,7 +793,7 @@ static int cpufreq_add_dev_interface(unsigned int cpu,
 
 	/* prepare interface data */
 	ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
-				   &sys_dev->kobj, "cpufreq");
+				   &dev->kobj, "cpufreq");
 	if (ret)
 		return ret;
 
@@ -866,9 +866,9 @@ err_out_kobj_put:
  * with with cpu hotplugging and all hell will break loose. Tried to clean this
  * mess up, but more thorough testing is needed. - Mathieu
  */
-static int cpufreq_add_dev(struct sys_device *sys_dev)
+static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 {
-	unsigned int cpu = sys_dev->id;
+	unsigned int cpu = dev->id;
 	int ret = 0, found = 0;
 	struct cpufreq_policy *policy;
 	unsigned long flags;
@@ -947,7 +947,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
 				     CPUFREQ_START, policy);
 
-	ret = cpufreq_add_dev_policy(cpu, policy, sys_dev);
+	ret = cpufreq_add_dev_policy(cpu, policy, dev);
 	if (ret) {
 		if (ret > 0)
 			/* This is a managed cpu, symlink created,
@@ -956,7 +956,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
 		goto err_unlock_policy;
 	}
 
-	ret = cpufreq_add_dev_interface(cpu, policy, sys_dev);
+	ret = cpufreq_add_dev_interface(cpu, policy, dev);
 	if (ret)
 		goto err_out_unregister;
 
@@ -999,15 +999,15 @@ module_out:
  * Caller should already have policy_rwsem in write mode for this CPU.
  * This routine frees the rwsem before returning.
  */
-static int __cpufreq_remove_dev(struct sys_device *sys_dev)
+static int __cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 {
-	unsigned int cpu = sys_dev->id;
+	unsigned int cpu = dev->id;
 	unsigned long flags;
 	struct cpufreq_policy *data;
 	struct kobject *kobj;
 	struct completion *cmp;
 #ifdef CONFIG_SMP
-	struct sys_device *cpu_sys_dev;
+	struct device *cpu_dev;
 	unsigned int j;
 #endif
 
@@ -1032,7 +1032,7 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev)
 		pr_debug("removing link\n");
 		cpumask_clear_cpu(cpu, data->cpus);
 		spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
-		kobj = &sys_dev->kobj;
+		kobj = &dev->kobj;
 		cpufreq_cpu_put(data);
 		unlock_policy_rwsem_write(cpu);
 		sysfs_remove_link(kobj, "cpufreq");
@@ -1071,8 +1071,8 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev)
 			strncpy(per_cpu(cpufreq_cpu_governor, j),
 				data->governor->name, CPUFREQ_NAME_LEN);
 #endif
-			cpu_sys_dev = get_cpu_sysdev(j);
-			kobj = &cpu_sys_dev->kobj;
+			cpu_dev = get_cpu_device(j);
+			kobj = &cpu_dev->kobj;
 			unlock_policy_rwsem_write(cpu);
 			sysfs_remove_link(kobj, "cpufreq");
 			lock_policy_rwsem_write(cpu);
@@ -1112,11 +1112,11 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev)
 	if (unlikely(cpumask_weight(data->cpus) > 1)) {
 		/* first sibling now owns the new sysfs dir */
 		cpumask_clear_cpu(cpu, data->cpus);
-		cpufreq_add_dev(get_cpu_sysdev(cpumask_first(data->cpus)));
+		cpufreq_add_dev(get_cpu_device(cpumask_first(data->cpus)), NULL);
 
 		/* finally remove our own symlink */
 		lock_policy_rwsem_write(cpu);
-		__cpufreq_remove_dev(sys_dev);
+		__cpufreq_remove_dev(dev, sif);
 	}
 #endif
 
@@ -1128,9 +1128,9 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev)
 }
 
 
-static int cpufreq_remove_dev(struct sys_device *sys_dev)
+static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 {
-	unsigned int cpu = sys_dev->id;
+	unsigned int cpu = dev->id;
 	int retval;
 
 	if (cpu_is_offline(cpu))
@@ -1139,7 +1139,7 @@ static int cpufreq_remove_dev(struct sys_device *sys_dev)
 	if (unlikely(lock_policy_rwsem_write(cpu)))
 		BUG();
 
-	retval = __cpufreq_remove_dev(sys_dev);
+	retval = __cpufreq_remove_dev(dev, sif);
 	return retval;
 }
 
@@ -1271,9 +1271,11 @@ out:
 }
 EXPORT_SYMBOL(cpufreq_get);
 
-static struct sysdev_driver cpufreq_sysdev_driver = {
-	.add		= cpufreq_add_dev,
-	.remove		= cpufreq_remove_dev,
+static struct subsys_interface cpufreq_interface = {
+	.name		= "cpufreq",
+	.subsys		= &cpu_subsys,
+	.add_dev	= cpufreq_add_dev,
+	.remove_dev	= cpufreq_remove_dev,
 };
 
 
@@ -1765,25 +1767,25 @@ static int __cpuinit cpufreq_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *dev;
 
-	sys_dev = get_cpu_sysdev(cpu);
-	if (sys_dev) {
+	dev = get_cpu_device(cpu);
+	if (dev) {
 		switch (action) {
 		case CPU_ONLINE:
 		case CPU_ONLINE_FROZEN:
-			cpufreq_add_dev(sys_dev);
+			cpufreq_add_dev(dev, NULL);
 			break;
 		case CPU_DOWN_PREPARE:
 		case CPU_DOWN_PREPARE_FROZEN:
 			if (unlikely(lock_policy_rwsem_write(cpu)))
 				BUG();
 
-			__cpufreq_remove_dev(sys_dev);
+			__cpufreq_remove_dev(dev, NULL);
 			break;
 		case CPU_DOWN_FAILED:
 		case CPU_DOWN_FAILED_FROZEN:
-			cpufreq_add_dev(sys_dev);
+			cpufreq_add_dev(dev, NULL);
 			break;
 		}
 	}
@@ -1830,8 +1832,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	cpufreq_driver = driver_data;
 	spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
-	ret = sysdev_driver_register(&cpu_sysdev_class,
-					&cpufreq_sysdev_driver);
+	ret = subsys_interface_register(&cpufreq_interface);
 	if (ret)
 		goto err_null_driver;
 
@@ -1850,7 +1851,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 		if (ret) {
 			pr_debug("no CPU initialized for driver %s\n",
 							driver_data->name);
-			goto err_sysdev_unreg;
+			goto err_if_unreg;
 		}
 	}
 
@@ -1858,9 +1859,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	pr_debug("driver %s up and running\n", driver_data->name);
 
 	return 0;
-err_sysdev_unreg:
-	sysdev_driver_unregister(&cpu_sysdev_class,
-			&cpufreq_sysdev_driver);
+err_if_unreg:
+	subsys_interface_unregister(&cpufreq_interface);
 err_null_driver:
 	spin_lock_irqsave(&cpufreq_driver_lock, flags);
 	cpufreq_driver = NULL;
@@ -1887,7 +1887,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
 
 	pr_debug("unregistering driver %s\n", driver->name);
 
-	sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver);
+	subsys_interface_unregister(&cpufreq_interface);
 	unregister_hotcpu_notifier(&cpufreq_cpu_notifier);
 
 	spin_lock_irqsave(&cpufreq_driver_lock, flags);
@@ -1907,8 +1907,7 @@ static int __init cpufreq_core_init(void)
 		init_rwsem(&per_cpu(cpu_policy_rwsem, cpu));
 	}
 
-	cpufreq_global_kobject = kobject_create_and_add("cpufreq",
-						&cpu_sysdev_class.kset.kobj);
+	cpufreq_global_kobject = kobject_create_and_add("cpufreq", &cpu_subsys.dev_root->kobj);
 	BUG_ON(!cpufreq_global_kobject);
 	register_syscore_ops(&cpufreq_syscore_ops);
 
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index c5072a91e848..390380a8cfc9 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -11,7 +11,6 @@
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/sysdev.h>
 #include <linux/cpu.h>
 #include <linux/sysfs.h>
 #include <linux/cpufreq.h>
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 06ce2680d00d..59f4261c753a 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -291,10 +291,10 @@ EXPORT_SYMBOL_GPL(cpuidle_disable_device);
 static int __cpuidle_register_device(struct cpuidle_device *dev)
 {
 	int ret;
-	struct sys_device *sys_dev = get_cpu_sysdev((unsigned long)dev->cpu);
+	struct device *cpu_dev = get_cpu_device((unsigned long)dev->cpu);
 	struct cpuidle_driver *cpuidle_driver = cpuidle_get_driver();
 
-	if (!sys_dev)
+	if (!dev)
 		return -EINVAL;
 	if (!try_module_get(cpuidle_driver->owner))
 		return -EINVAL;
@@ -303,7 +303,7 @@ static int __cpuidle_register_device(struct cpuidle_device *dev)
 
 	per_cpu(cpuidle_devices, dev->cpu) = dev;
 	list_add(&dev->device_list, &cpuidle_detected_devices);
-	if ((ret = cpuidle_add_sysfs(sys_dev))) {
+	if ((ret = cpuidle_add_sysfs(cpu_dev))) {
 		module_put(cpuidle_driver->owner);
 		return ret;
 	}
@@ -344,7 +344,7 @@ EXPORT_SYMBOL_GPL(cpuidle_register_device);
  */
 void cpuidle_unregister_device(struct cpuidle_device *dev)
 {
-	struct sys_device *sys_dev = get_cpu_sysdev((unsigned long)dev->cpu);
+	struct device *cpu_dev = get_cpu_device((unsigned long)dev->cpu);
 	struct cpuidle_driver *cpuidle_driver = cpuidle_get_driver();
 
 	if (dev->registered == 0)
@@ -354,7 +354,7 @@ void cpuidle_unregister_device(struct cpuidle_device *dev)
 
 	cpuidle_disable_device(dev);
 
-	cpuidle_remove_sysfs(sys_dev);
+	cpuidle_remove_sysfs(cpu_dev);
 	list_del(&dev->device_list);
 	wait_for_completion(&dev->kobj_unregister);
 	per_cpu(cpuidle_devices, dev->cpu) = NULL;
@@ -411,7 +411,7 @@ static int __init cpuidle_init(void)
 	if (cpuidle_disabled())
 		return -ENODEV;
 
-	ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
+	ret = cpuidle_add_interface(cpu_subsys.dev_root);
 	if (ret)
 		return ret;
 
diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h
index 38c3fd8b9d76..7db186685c27 100644
--- a/drivers/cpuidle/cpuidle.h
+++ b/drivers/cpuidle/cpuidle.h
@@ -5,7 +5,7 @@
 #ifndef __DRIVER_CPUIDLE_H
 #define __DRIVER_CPUIDLE_H
 
-#include <linux/sysdev.h>
+#include <linux/device.h>
 
 /* For internal use only */
 extern struct cpuidle_governor *cpuidle_curr_governor;
@@ -23,11 +23,11 @@ extern void cpuidle_uninstall_idle_handler(void);
 extern int cpuidle_switch_governor(struct cpuidle_governor *gov);
 
 /* sysfs */
-extern int cpuidle_add_class_sysfs(struct sysdev_class *cls);
-extern void cpuidle_remove_class_sysfs(struct sysdev_class *cls);
+extern int cpuidle_add_interface(struct device *dev);
+extern void cpuidle_remove_interface(struct device *dev);
 extern int cpuidle_add_state_sysfs(struct cpuidle_device *device);
 extern void cpuidle_remove_state_sysfs(struct cpuidle_device *device);
-extern int cpuidle_add_sysfs(struct sys_device *sysdev);
-extern void cpuidle_remove_sysfs(struct sys_device *sysdev);
+extern int cpuidle_add_sysfs(struct device *dev);
+extern void cpuidle_remove_sysfs(struct device *dev);
 
 #endif /* __DRIVER_CPUIDLE_H */
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 1e756e160dca..3fe41fe4851a 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -22,8 +22,8 @@ static int __init cpuidle_sysfs_setup(char *unused)
 }
 __setup("cpuidle_sysfs_switch", cpuidle_sysfs_setup);
 
-static ssize_t show_available_governors(struct sysdev_class *class,
-					struct sysdev_class_attribute *attr,
+static ssize_t show_available_governors(struct device *dev,
+					struct device_attribute *attr,
 					char *buf)
 {
 	ssize_t i = 0;
@@ -42,8 +42,8 @@ out:
 	return i;
 }
 
-static ssize_t show_current_driver(struct sysdev_class *class,
-				   struct sysdev_class_attribute *attr,
+static ssize_t show_current_driver(struct device *dev,
+				   struct device_attribute *attr,
 				   char *buf)
 {
 	ssize_t ret;
@@ -59,8 +59,8 @@ static ssize_t show_current_driver(struct sysdev_class *class,
 	return ret;
 }
 
-static ssize_t show_current_governor(struct sysdev_class *class,
-				     struct sysdev_class_attribute *attr,
+static ssize_t show_current_governor(struct device *dev,
+				     struct device_attribute *attr,
 				     char *buf)
 {
 	ssize_t ret;
@@ -75,8 +75,8 @@ static ssize_t show_current_governor(struct sysdev_class *class,
 	return ret;
 }
 
-static ssize_t store_current_governor(struct sysdev_class *class,
-				      struct sysdev_class_attribute *attr,
+static ssize_t store_current_governor(struct device *dev,
+				      struct device_attribute *attr,
 				      const char *buf, size_t count)
 {
 	char gov_name[CPUIDLE_NAME_LEN];
@@ -109,50 +109,48 @@ static ssize_t store_current_governor(struct sysdev_class *class,
 		return count;
 }
 
-static SYSDEV_CLASS_ATTR(current_driver, 0444, show_current_driver, NULL);
-static SYSDEV_CLASS_ATTR(current_governor_ro, 0444, show_current_governor,
-			 NULL);
+static DEVICE_ATTR(current_driver, 0444, show_current_driver, NULL);
+static DEVICE_ATTR(current_governor_ro, 0444, show_current_governor, NULL);
 
-static struct attribute *cpuclass_default_attrs[] = {
-	&attr_current_driver.attr,
-	&attr_current_governor_ro.attr,
+static struct attribute *cpuidle_default_attrs[] = {
+	&dev_attr_current_driver.attr,
+	&dev_attr_current_governor_ro.attr,
 	NULL
 };
 
-static SYSDEV_CLASS_ATTR(available_governors, 0444, show_available_governors,
-			 NULL);
-static SYSDEV_CLASS_ATTR(current_governor, 0644, show_current_governor,
-			 store_current_governor);
+static DEVICE_ATTR(available_governors, 0444, show_available_governors, NULL);
+static DEVICE_ATTR(current_governor, 0644, show_current_governor,
+		   store_current_governor);
 
-static struct attribute *cpuclass_switch_attrs[] = {
-	&attr_available_governors.attr,
-	&attr_current_driver.attr,
-	&attr_current_governor.attr,
+static struct attribute *cpuidle_switch_attrs[] = {
+	&dev_attr_available_governors.attr,
+	&dev_attr_current_driver.attr,
+	&dev_attr_current_governor.attr,
 	NULL
 };
 
-static struct attribute_group cpuclass_attr_group = {
-	.attrs = cpuclass_default_attrs,
+static struct attribute_group cpuidle_attr_group = {
+	.attrs = cpuidle_default_attrs,
 	.name = "cpuidle",
 };
 
 /**
- * cpuidle_add_class_sysfs - add CPU global sysfs attributes
+ * cpuidle_add_interface - add CPU global sysfs attributes
  */
-int cpuidle_add_class_sysfs(struct sysdev_class *cls)
+int cpuidle_add_interface(struct device *dev)
 {
 	if (sysfs_switch)
-		cpuclass_attr_group.attrs = cpuclass_switch_attrs;
+		cpuidle_attr_group.attrs = cpuidle_switch_attrs;
 
-	return sysfs_create_group(&cls->kset.kobj, &cpuclass_attr_group);
+	return sysfs_create_group(&dev->kobj, &cpuidle_attr_group);
 }
 
 /**
- * cpuidle_remove_class_sysfs - remove CPU global sysfs attributes
+ * cpuidle_remove_interface - remove CPU global sysfs attributes
  */
-void cpuidle_remove_class_sysfs(struct sysdev_class *cls)
+void cpuidle_remove_interface(struct device *dev)
 {
-	sysfs_remove_group(&cls->kset.kobj, &cpuclass_attr_group);
+	sysfs_remove_group(&dev->kobj, &cpuidle_attr_group);
 }
 
 struct cpuidle_attr {
@@ -365,16 +363,16 @@ void cpuidle_remove_state_sysfs(struct cpuidle_device *device)
 
 /**
  * cpuidle_add_sysfs - creates a sysfs instance for the target device
- * @sysdev: the target device
+ * @dev: the target device
  */
-int cpuidle_add_sysfs(struct sys_device *sysdev)
+int cpuidle_add_sysfs(struct device *cpu_dev)
 {
-	int cpu = sysdev->id;
+	int cpu = cpu_dev->id;
 	struct cpuidle_device *dev;
 	int error;
 
 	dev = per_cpu(cpuidle_devices, cpu);
-	error = kobject_init_and_add(&dev->kobj, &ktype_cpuidle, &sysdev->kobj,
+	error = kobject_init_and_add(&dev->kobj, &ktype_cpuidle, &cpu_dev->kobj,
 				     "cpuidle");
 	if (!error)
 		kobject_uevent(&dev->kobj, KOBJ_ADD);
@@ -383,11 +381,11 @@ int cpuidle_add_sysfs(struct sys_device *sysdev)
 
 /**
  * cpuidle_remove_sysfs - deletes a sysfs instance on the target device
- * @sysdev: the target device
+ * @dev: the target device
  */
-void cpuidle_remove_sysfs(struct sys_device *sysdev)
+void cpuidle_remove_sysfs(struct device *cpu_dev)
 {
-	int cpu = sysdev->id;
+	int cpu = cpu_dev->id;
 	struct cpuidle_device *dev;
 
 	dev = per_cpu(cpuidle_devices, cpu);
diff --git a/drivers/s390/char/sclp_config.c b/drivers/s390/char/sclp_config.c
index 95b909ac2b73..3c03c1060be6 100644
--- a/drivers/s390/char/sclp_config.c
+++ b/drivers/s390/char/sclp_config.c
@@ -11,7 +11,7 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/cpu.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/workqueue.h>
 #include <asm/smp.h>
 
@@ -31,14 +31,14 @@ static struct work_struct sclp_cpu_change_work;
 static void sclp_cpu_capability_notify(struct work_struct *work)
 {
 	int cpu;
-	struct sys_device *sysdev;
+	struct device *dev;
 
 	s390_adjust_jiffies();
 	pr_warning("cpu capability changed.\n");
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
-		sysdev = get_cpu_sysdev(cpu);
-		kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
+		dev = get_cpu_device(cpu);
+		kobject_uevent(&dev->kobj, KOBJ_CHANGE);
 	}
 	put_online_cpus();
 }
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 6cb60fd2ea84..fc3da0d70d68 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -14,7 +14,7 @@
 #ifndef _LINUX_CPU_H_
 #define _LINUX_CPU_H_
 
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/node.h>
 #include <linux/compiler.h>
 #include <linux/cpumask.h>
@@ -22,19 +22,19 @@
 struct cpu {
 	int node_id;		/* The node which contains the CPU */
 	int hotpluggable;	/* creates sysfs control file if hotpluggable */
-	struct sys_device sysdev;
+	struct device dev;
 };
 
 extern int register_cpu(struct cpu *cpu, int num);
-extern struct sys_device *get_cpu_sysdev(unsigned cpu);
+extern struct device *get_cpu_device(unsigned cpu);
 
-extern int cpu_add_sysdev_attr(struct sysdev_attribute *attr);
-extern void cpu_remove_sysdev_attr(struct sysdev_attribute *attr);
+extern int cpu_add_dev_attr(struct device_attribute *attr);
+extern void cpu_remove_dev_attr(struct device_attribute *attr);
 
-extern int cpu_add_sysdev_attr_group(struct attribute_group *attrs);
-extern void cpu_remove_sysdev_attr_group(struct attribute_group *attrs);
+extern int cpu_add_dev_attr_group(struct attribute_group *attrs);
+extern void cpu_remove_dev_attr_group(struct attribute_group *attrs);
 
-extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls);
+extern int sched_create_sysfs_power_savings_entries(struct device *dev);
 
 #ifdef CONFIG_HOTPLUG_CPU
 extern void unregister_cpu(struct cpu *cpu);
@@ -160,7 +160,7 @@ static inline void cpu_maps_update_done(void)
 }
 
 #endif /* CONFIG_SMP */
-extern struct sysdev_class cpu_sysdev_class;
+extern struct bus_type cpu_subsys;
 
 #ifdef CONFIG_HOTPLUG_CPU
 /* Stop CPUs going up and down. */
diff --git a/kernel/sched.c b/kernel/sched.c
index 0e9344a71be3..530772646443 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7923,54 +7923,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 }
 
 #ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
-					   struct sysdev_class_attribute *attr,
-					   char *page)
+static ssize_t sched_mc_power_savings_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
 {
-	return sprintf(page, "%u\n", sched_mc_power_savings);
+	return sprintf(buf, "%u\n", sched_mc_power_savings);
 }
-static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
-					    struct sysdev_class_attribute *attr,
+static ssize_t sched_mc_power_savings_store(struct device *dev,
+					    struct device_attribute *attr,
 					    const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 0);
 }
-static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
-			 sched_mc_power_savings_show,
-			 sched_mc_power_savings_store);
+static DEVICE_ATTR(sched_mc_power_savings, 0644,
+		   sched_mc_power_savings_show,
+		   sched_mc_power_savings_store);
 #endif
 
 #ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
-					    struct sysdev_class_attribute *attr,
-					    char *page)
+static ssize_t sched_smt_power_savings_show(struct device *dev,
+					    struct device_attribute *attr,
+					    char *buf)
 {
-	return sprintf(page, "%u\n", sched_smt_power_savings);
+	return sprintf(buf, "%u\n", sched_smt_power_savings);
 }
-static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
-					     struct sysdev_class_attribute *attr,
+static ssize_t sched_smt_power_savings_store(struct device *dev,
+					    struct device_attribute *attr,
 					     const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 1);
 }
-static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+static DEVICE_ATTR(sched_smt_power_savings, 0644,
 		   sched_smt_power_savings_show,
 		   sched_smt_power_savings_store);
 #endif
 
-int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+int __init sched_create_sysfs_power_savings_entries(struct device *dev)
 {
 	int err = 0;
 
 #ifdef CONFIG_SCHED_SMT
 	if (smt_capable())
-		err = sysfs_create_file(&cls->kset.kobj,
-					&attr_sched_smt_power_savings.attr);
+		err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
 #endif
 #ifdef CONFIG_SCHED_MC
 	if (!err && mc_capable())
-		err = sysfs_create_file(&cls->kset.kobj,
-					&attr_sched_mc_power_savings.attr);
+		err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
 #endif
 	return err;
 }
-- 
cgit v1.2.3-55-g7522


From edbaa603eb801655e80808a9cf3d3b622e8ac66b Mon Sep 17 00:00:00 2001
From: Kay Sievers
Date: Wed, 21 Dec 2011 16:26:03 -0800
Subject: driver-core: remove sysdev.h usage.

The sysdev.h file should not be needed by any in-kernel code, so remove
the .h file from these random files that seem to still want to include
it.

The sysdev code will be going away soon, so this include needs to be
removed no matter what.

Cc: Jiandong Zheng <jdzheng@broadcom.com>
Cc: Scott Branden <sbranden@broadcom.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Kukjin Kim <kgene.kim@samsung.com>
Cc: David Brown <davidb@codeaurora.org>
Cc: Daniel Walker <dwalker@fifo99.com>
Cc: Bryan Huntsman <bryanh@codeaurora.org>
Cc: Ben Dooks <ben-linux@fluff.org>
Cc: Wan ZongShun <mcuos.com@gmail.com>
Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: "Venkatesh Pallipadi
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: Richard Purdie <rpurdie@rpsys.net>
Cc: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
---
 arch/arm/mach-bcmring/core.c                 | 1 -
 arch/arm/mach-exynos/irq-eint.c              | 2 +-
 arch/arm/mach-integrator/integrator_cp.c     | 2 +-
 arch/arm/mach-ks8695/irq.c                   | 2 +-
 arch/arm/mach-lpc32xx/phy3250.c              | 2 +-
 arch/arm/mach-msm/board-sapphire.c           | 2 +-
 arch/arm/mach-realview/core.c                | 2 +-
 arch/arm/mach-realview/realview_eb.c         | 2 +-
 arch/arm/mach-realview/realview_pb1176.c     | 2 +-
 arch/arm/mach-realview/realview_pb11mp.c     | 2 +-
 arch/arm/mach-realview/realview_pba8.c       | 2 +-
 arch/arm/mach-realview/realview_pbx.c        | 2 +-
 arch/arm/mach-s3c2410/bast-irq.c             | 2 +-
 arch/arm/mach-s3c2410/mach-h1940.c           | 2 +-
 arch/arm/mach-s3c2410/mach-qt2410.c          | 2 +-
 arch/arm/mach-s3c2412/clock.c                | 2 +-
 arch/arm/mach-s3c2440/mach-rx1950.c          | 2 +-
 arch/arm/mach-s3c2440/mach-rx3715.c          | 2 +-
 arch/arm/mach-s3c2443/clock.c                | 2 +-
 arch/arm/mach-s3c64xx/irq-eint.c             | 2 +-
 arch/arm/mach-s5p64x0/clock-s5p6440.c        | 2 +-
 arch/arm/mach-s5p64x0/clock-s5p6450.c        | 2 +-
 arch/arm/mach-s5p64x0/clock.c                | 2 +-
 arch/arm/mach-s5pv210/clock.c                | 2 +-
 arch/arm/mach-s5pv210/mach-smdkc110.c        | 2 +-
 arch/arm/mach-s5pv210/mach-smdkv210.c        | 2 +-
 arch/arm/mach-versatile/core.c               | 1 -
 arch/arm/mach-versatile/versatile_ab.c       | 1 -
 arch/arm/mach-versatile/versatile_pb.c       | 1 -
 arch/arm/mach-vexpress/v2m.c                 | 2 +-
 arch/arm/mach-w90x900/irq.c                  | 2 +-
 arch/arm/plat-s3c24xx/common-smdk.c          | 2 +-
 arch/arm/plat-s3c24xx/cpu-freq.c             | 2 +-
 arch/arm/plat-s3c24xx/irq.c                  | 2 +-
 arch/arm/plat-s3c24xx/pm-simtec.c            | 1 -
 arch/arm/plat-s3c24xx/s3c2410-clock.c        | 2 +-
 arch/arm/plat-s3c24xx/s3c2412-iotiming.c     | 2 +-
 arch/arm/plat-s5p/clock.c                    | 2 +-
 arch/arm/plat-s5p/irq-eint.c                 | 2 +-
 arch/arm/plat-samsung/clock-clksrc.c         | 2 +-
 arch/arm/plat-samsung/clock.c                | 2 +-
 arch/arm/plat-samsung/pm-gpio.c              | 2 +-
 arch/arm/plat-samsung/wakeup-mask.c          | 2 +-
 arch/avr32/boards/merisc/merisc_sysfs.c      | 1 -
 arch/avr32/kernel/irq.c                      | 2 +-
 arch/mips/txx9/generic/setup_tx4939.c        | 2 +-
 arch/powerpc/platforms/cell/smp.c            | 2 +-
 arch/powerpc/platforms/iseries/smp.c         | 2 +-
 arch/powerpc/platforms/powermac/cpufreq_32.c | 2 +-
 arch/powerpc/platforms/pseries/smp.c         | 2 +-
 arch/powerpc/sysdev/uic.c                    | 1 -
 arch/unicore32/kernel/puv3-core.c            | 1 -
 arch/unicore32/kernel/puv3-nb0916.c          | 1 -
 arch/x86/kernel/hpet.c                       | 1 -
 arch/x86/kernel/irqinit.c                    | 2 +-
 arch/x86/platform/uv/uv_sysfs.c              | 2 +-
 drivers/gpio/gpio-samsung.c                  | 2 +-
 drivers/leds/led-class.c                     | 1 -
 drivers/leds/led-triggers.c                  | 1 -
 drivers/macintosh/smu.c                      | 4 ----
 drivers/net/bonding/bond_sysfs.c             | 1 -
 drivers/platform/x86/intel_scu_ipc.c         | 2 +-
 drivers/s390/block/xpram.c                   | 2 +-
 63 files changed, 49 insertions(+), 66 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/mach-bcmring/core.c b/arch/arm/mach-bcmring/core.c
index 43eadbcc29ed..31d84ae44c4c 100644
--- a/arch/arm/mach-bcmring/core.c
+++ b/arch/arm/mach-bcmring/core.c
@@ -25,7 +25,6 @@
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
 #include <linux/platform_device.h>
-#include <linux/sysdev.h>
 #include <linux/interrupt.h>
 #include <linux/amba/bus.h>
 #include <linux/clkdev.h>
diff --git a/arch/arm/mach-exynos/irq-eint.c b/arch/arm/mach-exynos/irq-eint.c
index badb8c66fc9b..fe461901d60f 100644
--- a/arch/arm/mach-exynos/irq-eint.c
+++ b/arch/arm/mach-exynos/irq-eint.c
@@ -14,7 +14,7 @@
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/io.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/gpio.h>
 
 #include <plat/pm.h>
diff --git a/arch/arm/mach-integrator/integrator_cp.c b/arch/arm/mach-integrator/integrator_cp.c
index 5de49c33e4d4..208545c53da6 100644
--- a/arch/arm/mach-integrator/integrator_cp.c
+++ b/arch/arm/mach-integrator/integrator_cp.c
@@ -14,7 +14,7 @@
 #include <linux/platform_device.h>
 #include <linux/dma-mapping.h>
 #include <linux/string.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/amba/bus.h>
 #include <linux/amba/kmi.h>
 #include <linux/amba/clcd.h>
diff --git a/arch/arm/mach-ks8695/irq.c b/arch/arm/mach-ks8695/irq.c
index a78092dcd6fb..76802aac0f45 100644
--- a/arch/arm/mach-ks8695/irq.c
+++ b/arch/arm/mach-ks8695/irq.c
@@ -23,7 +23,7 @@
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/io.h>
 
 #include <mach/hardware.h>
diff --git a/arch/arm/mach-lpc32xx/phy3250.c b/arch/arm/mach-lpc32xx/phy3250.c
index 6d2f0d1b9373..21f3d4b9ee85 100644
--- a/arch/arm/mach-lpc32xx/phy3250.c
+++ b/arch/arm/mach-lpc32xx/phy3250.c
@@ -18,7 +18,7 @@
 
 #include <linux/init.h>
 #include <linux/platform_device.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/dma-mapping.h>
diff --git a/arch/arm/mach-msm/board-sapphire.c b/arch/arm/mach-msm/board-sapphire.c
index 32b465763dbd..97b8191d9d38 100644
--- a/arch/arm/mach-msm/board-sapphire.c
+++ b/arch/arm/mach-msm/board-sapphire.c
@@ -18,7 +18,7 @@
 #include <linux/input.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 
 #include <linux/delay.h>
 
diff --git a/arch/arm/mach-realview/core.c b/arch/arm/mach-realview/core.c
index d5ed5d4f77d6..acd329afc3ac 100644
--- a/arch/arm/mach-realview/core.c
+++ b/arch/arm/mach-realview/core.c
@@ -21,7 +21,7 @@
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/dma-mapping.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/interrupt.h>
 #include <linux/amba/bus.h>
 #include <linux/amba/clcd.h>
diff --git a/arch/arm/mach-realview/realview_eb.c b/arch/arm/mach-realview/realview_eb.c
index 026c66ad7ec2..af608f76e0a7 100644
--- a/arch/arm/mach-realview/realview_eb.c
+++ b/arch/arm/mach-realview/realview_eb.c
@@ -21,7 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/platform_device.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/amba/bus.h>
 #include <linux/amba/pl061.h>
 #include <linux/amba/mmci.h>
diff --git a/arch/arm/mach-realview/realview_pb1176.c b/arch/arm/mach-realview/realview_pb1176.c
index c057540ec776..510424669d0e 100644
--- a/arch/arm/mach-realview/realview_pb1176.c
+++ b/arch/arm/mach-realview/realview_pb1176.c
@@ -21,7 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/platform_device.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/amba/bus.h>
 #include <linux/amba/pl061.h>
 #include <linux/amba/mmci.h>
diff --git a/arch/arm/mach-realview/realview_pb11mp.c b/arch/arm/mach-realview/realview_pb11mp.c
index 671ad6d6ff00..70d1bbdf2a3e 100644
--- a/arch/arm/mach-realview/realview_pb11mp.c
+++ b/arch/arm/mach-realview/realview_pb11mp.c
@@ -21,7 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/platform_device.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/amba/bus.h>
 #include <linux/amba/pl061.h>
 #include <linux/amba/mmci.h>
diff --git a/arch/arm/mach-realview/realview_pba8.c b/arch/arm/mach-realview/realview_pba8.c
index cbf22df4ad5b..b841fc0a75dc 100644
--- a/arch/arm/mach-realview/realview_pba8.c
+++ b/arch/arm/mach-realview/realview_pba8.c
@@ -21,7 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/platform_device.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/amba/bus.h>
 #include <linux/amba/pl061.h>
 #include <linux/amba/mmci.h>
diff --git a/arch/arm/mach-realview/realview_pbx.c b/arch/arm/mach-realview/realview_pbx.c
index 63c4114afae9..e102120fc6ac 100644
--- a/arch/arm/mach-realview/realview_pbx.c
+++ b/arch/arm/mach-realview/realview_pbx.c
@@ -20,7 +20,7 @@
 
 #include <linux/init.h>
 #include <linux/platform_device.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/amba/bus.h>
 #include <linux/amba/pl061.h>
 #include <linux/amba/mmci.h>
diff --git a/arch/arm/mach-s3c2410/bast-irq.c b/arch/arm/mach-s3c2410/bast-irq.c
index bc53d2d16d1a..ac7b2ad5c405 100644
--- a/arch/arm/mach-s3c2410/bast-irq.c
+++ b/arch/arm/mach-s3c2410/bast-irq.c
@@ -24,7 +24,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/ioport.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/io.h>
 
 #include <asm/mach-types.h>
diff --git a/arch/arm/mach-s3c2410/mach-h1940.c b/arch/arm/mach-s3c2410/mach-h1940.c
index 05a7d16e59f5..837a2d6f3524 100644
--- a/arch/arm/mach-s3c2410/mach-h1940.c
+++ b/arch/arm/mach-s3c2410/mach-h1940.c
@@ -18,7 +18,7 @@
 #include <linux/memblock.h>
 #include <linux/timer.h>
 #include <linux/init.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/serial_core.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
diff --git a/arch/arm/mach-s3c2410/mach-qt2410.c b/arch/arm/mach-s3c2410/mach-qt2410.c
index 451852156254..6f69789943d5 100644
--- a/arch/arm/mach-s3c2410/mach-qt2410.c
+++ b/arch/arm/mach-s3c2410/mach-qt2410.c
@@ -28,7 +28,7 @@
 #include <linux/timer.h>
 #include <linux/init.h>
 #include <linux/gpio.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/platform_device.h>
 #include <linux/serial_core.h>
 #include <linux/spi/spi.h>
diff --git a/arch/arm/mach-s3c2412/clock.c b/arch/arm/mach-s3c2412/clock.c
index 140711db6c89..516881640808 100644
--- a/arch/arm/mach-s3c2412/clock.c
+++ b/arch/arm/mach-s3c2412/clock.c
@@ -26,7 +26,7 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/err.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/clk.h>
 #include <linux/mutex.h>
 #include <linux/delay.h>
diff --git a/arch/arm/mach-s3c2440/mach-rx1950.c b/arch/arm/mach-s3c2440/mach-rx1950.c
index 0d3453bf567c..2078c1fe8e35 100644
--- a/arch/arm/mach-s3c2440/mach-rx1950.c
+++ b/arch/arm/mach-s3c2440/mach-rx1950.c
@@ -24,7 +24,7 @@
 #include <linux/serial_core.h>
 #include <linux/input.h>
 #include <linux/gpio_keys.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/pda_power.h>
 #include <linux/pwm_backlight.h>
 #include <linux/pwm.h>
diff --git a/arch/arm/mach-s3c2440/mach-rx3715.c b/arch/arm/mach-s3c2440/mach-rx3715.c
index e19499c2f909..4c2f553fbd52 100644
--- a/arch/arm/mach-s3c2440/mach-rx3715.c
+++ b/arch/arm/mach-s3c2440/mach-rx3715.c
@@ -20,7 +20,7 @@
 #include <linux/init.h>
 #include <linux/tty.h>
 #include <linux/console.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/platform_device.h>
 #include <linux/serial_core.h>
 #include <linux/serial.h>
diff --git a/arch/arm/mach-s3c2443/clock.c b/arch/arm/mach-s3c2443/clock.c
index 1c2c088aa2e8..6dde2696f8f0 100644
--- a/arch/arm/mach-s3c2443/clock.c
+++ b/arch/arm/mach-s3c2443/clock.c
@@ -27,7 +27,7 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/err.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/clk.h>
 #include <linux/mutex.h>
 #include <linux/serial_core.h>
diff --git a/arch/arm/mach-s3c64xx/irq-eint.c b/arch/arm/mach-s3c64xx/irq-eint.c
index 4d203be1f4c3..e3e75d186039 100644
--- a/arch/arm/mach-s3c64xx/irq-eint.c
+++ b/arch/arm/mach-s3c64xx/irq-eint.c
@@ -14,7 +14,7 @@
 
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/gpio.h>
 #include <linux/irq.h>
 #include <linux/io.h>
diff --git a/arch/arm/mach-s5p64x0/clock-s5p6440.c b/arch/arm/mach-s5p64x0/clock-s5p6440.c
index c54c65d511f0..8a293c3e3944 100644
--- a/arch/arm/mach-s5p64x0/clock-s5p6440.c
+++ b/arch/arm/mach-s5p64x0/clock-s5p6440.c
@@ -17,7 +17,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/clk.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/io.h>
 
 #include <mach/hardware.h>
diff --git a/arch/arm/mach-s5p64x0/clock-s5p6450.c b/arch/arm/mach-s5p64x0/clock-s5p6450.c
index 2d04abfba12e..0277b483b9b7 100644
--- a/arch/arm/mach-s5p64x0/clock-s5p6450.c
+++ b/arch/arm/mach-s5p64x0/clock-s5p6450.c
@@ -17,7 +17,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/clk.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/io.h>
 
 #include <mach/hardware.h>
diff --git a/arch/arm/mach-s5p64x0/clock.c b/arch/arm/mach-s5p64x0/clock.c
index b52c6e2f37a6..041f91e5b425 100644
--- a/arch/arm/mach-s5p64x0/clock.c
+++ b/arch/arm/mach-s5p64x0/clock.c
@@ -17,7 +17,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/clk.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/io.h>
 
 #include <mach/hardware.h>
diff --git a/arch/arm/mach-s5pv210/clock.c b/arch/arm/mach-s5pv210/clock.c
index 4c5ac7a69e9e..6a1e5876bb84 100644
--- a/arch/arm/mach-s5pv210/clock.c
+++ b/arch/arm/mach-s5pv210/clock.c
@@ -17,7 +17,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/clk.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/io.h>
 
 #include <mach/map.h>
diff --git a/arch/arm/mach-s5pv210/mach-smdkc110.c b/arch/arm/mach-s5pv210/mach-smdkc110.c
index f7266bb0cac8..05cf7dc61286 100644
--- a/arch/arm/mach-s5pv210/mach-smdkc110.c
+++ b/arch/arm/mach-s5pv210/mach-smdkc110.c
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/serial_core.h>
 #include <linux/i2c.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
diff --git a/arch/arm/mach-s5pv210/mach-smdkv210.c b/arch/arm/mach-s5pv210/mach-smdkv210.c
index a9106c392398..7d4d98916d69 100644
--- a/arch/arm/mach-s5pv210/mach-smdkv210.c
+++ b/arch/arm/mach-s5pv210/mach-smdkv210.c
@@ -13,7 +13,7 @@
 #include <linux/i2c.h>
 #include <linux/init.h>
 #include <linux/serial_core.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/dm9000.h>
 #include <linux/fb.h>
 #include <linux/gpio.h>
diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c
index e340a54251df..3a4893b82ad1 100644
--- a/arch/arm/mach-versatile/core.c
+++ b/arch/arm/mach-versatile/core.c
@@ -22,7 +22,6 @@
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
 #include <linux/platform_device.h>
-#include <linux/sysdev.h>
 #include <linux/interrupt.h>
 #include <linux/irqdomain.h>
 #include <linux/of_address.h>
diff --git a/arch/arm/mach-versatile/versatile_ab.c b/arch/arm/mach-versatile/versatile_ab.c
index fda4866703cd..07b9122ff84f 100644
--- a/arch/arm/mach-versatile/versatile_ab.c
+++ b/arch/arm/mach-versatile/versatile_ab.c
@@ -21,7 +21,6 @@
 
 #include <linux/init.h>
 #include <linux/device.h>
-#include <linux/sysdev.h>
 #include <linux/amba/bus.h>
 #include <linux/io.h>
 
diff --git a/arch/arm/mach-versatile/versatile_pb.c b/arch/arm/mach-versatile/versatile_pb.c
index feaf9cbe60f6..8d8b80e79e03 100644
--- a/arch/arm/mach-versatile/versatile_pb.c
+++ b/arch/arm/mach-versatile/versatile_pb.c
@@ -21,7 +21,6 @@
 
 #include <linux/init.h>
 #include <linux/device.h>
-#include <linux/sysdev.h>
 #include <linux/amba/bus.h>
 #include <linux/amba/pl061.h>
 #include <linux/amba/mmci.h>
diff --git a/arch/arm/mach-vexpress/v2m.c b/arch/arm/mach-vexpress/v2m.c
index 1fafc3244607..de5a2384284b 100644
--- a/arch/arm/mach-vexpress/v2m.c
+++ b/arch/arm/mach-vexpress/v2m.c
@@ -10,7 +10,7 @@
 #include <linux/ata_platform.h>
 #include <linux/smsc911x.h>
 #include <linux/spinlock.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/usb/isp1760.h>
 #include <linux/clkdev.h>
 #include <linux/mtd/physmap.h>
diff --git a/arch/arm/mach-w90x900/irq.c b/arch/arm/mach-w90x900/irq.c
index 7bf143c443f1..f47e29586a01 100644
--- a/arch/arm/mach-w90x900/irq.c
+++ b/arch/arm/mach-w90x900/irq.c
@@ -19,7 +19,7 @@
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/ptrace.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/io.h>
 
 #include <asm/irq.h>
diff --git a/arch/arm/plat-s3c24xx/common-smdk.c b/arch/arm/plat-s3c24xx/common-smdk.c
index bcc43f346272..084604be6ad1 100644
--- a/arch/arm/plat-s3c24xx/common-smdk.c
+++ b/arch/arm/plat-s3c24xx/common-smdk.c
@@ -19,7 +19,7 @@
 #include <linux/timer.h>
 #include <linux/init.h>
 #include <linux/gpio.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/platform_device.h>
 
 #include <linux/mtd/mtd.h>
diff --git a/arch/arm/plat-s3c24xx/cpu-freq.c b/arch/arm/plat-s3c24xx/cpu-freq.c
index b3d3d0278997..468079938884 100644
--- a/arch/arm/plat-s3c24xx/cpu-freq.c
+++ b/arch/arm/plat-s3c24xx/cpu-freq.c
@@ -20,7 +20,7 @@
 #include <linux/clk.h>
 #include <linux/err.h>
 #include <linux/io.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
 
diff --git a/arch/arm/plat-s3c24xx/irq.c b/arch/arm/plat-s3c24xx/irq.c
index fc8c5f89954d..bc42c04091fd 100644
--- a/arch/arm/plat-s3c24xx/irq.c
+++ b/arch/arm/plat-s3c24xx/irq.c
@@ -22,7 +22,7 @@
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/syscore_ops.h>
 
 #include <asm/irq.h>
diff --git a/arch/arm/plat-s3c24xx/pm-simtec.c b/arch/arm/plat-s3c24xx/pm-simtec.c
index 663b280d65da..68296b1fe7e5 100644
--- a/arch/arm/plat-s3c24xx/pm-simtec.c
+++ b/arch/arm/plat-s3c24xx/pm-simtec.c
@@ -18,7 +18,6 @@
 #include <linux/list.h>
 #include <linux/timer.h>
 #include <linux/init.h>
-#include <linux/sysdev.h>
 #include <linux/device.h>
 #include <linux/io.h>
 
diff --git a/arch/arm/plat-s3c24xx/s3c2410-clock.c b/arch/arm/plat-s3c24xx/s3c2410-clock.c
index def76aa3825a..25dc4d4397b1 100644
--- a/arch/arm/plat-s3c24xx/s3c2410-clock.c
+++ b/arch/arm/plat-s3c24xx/s3c2410-clock.c
@@ -26,7 +26,7 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/err.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/clk.h>
 #include <linux/mutex.h>
 #include <linux/delay.h>
diff --git a/arch/arm/plat-s3c24xx/s3c2412-iotiming.c b/arch/arm/plat-s3c24xx/s3c2412-iotiming.c
index 0b46d3895d62..48eee39ab369 100644
--- a/arch/arm/plat-s3c24xx/s3c2412-iotiming.c
+++ b/arch/arm/plat-s3c24xx/s3c2412-iotiming.c
@@ -17,7 +17,7 @@
 #include <linux/ioport.h>
 #include <linux/cpufreq.h>
 #include <linux/seq_file.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/delay.h>
 #include <linux/clk.h>
 #include <linux/err.h>
diff --git a/arch/arm/plat-s5p/clock.c b/arch/arm/plat-s5p/clock.c
index 5f84a3f13ef9..963edea7f7e7 100644
--- a/arch/arm/plat-s5p/clock.c
+++ b/arch/arm/plat-s5p/clock.c
@@ -17,7 +17,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/clk.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/io.h>
 #include <asm/div64.h>
 
diff --git a/arch/arm/plat-s5p/irq-eint.c b/arch/arm/plat-s5p/irq-eint.c
index b5bb774985b0..c496b359c371 100644
--- a/arch/arm/plat-s5p/irq-eint.c
+++ b/arch/arm/plat-s5p/irq-eint.c
@@ -14,7 +14,7 @@
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/io.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/gpio.h>
 
 #include <asm/hardware/vic.h>
diff --git a/arch/arm/plat-samsung/clock-clksrc.c b/arch/arm/plat-samsung/clock-clksrc.c
index ae8b8507663f..786a4107a157 100644
--- a/arch/arm/plat-samsung/clock-clksrc.c
+++ b/arch/arm/plat-samsung/clock-clksrc.c
@@ -16,7 +16,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/clk.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/io.h>
 
 #include <plat/clock.h>
diff --git a/arch/arm/plat-samsung/clock.c b/arch/arm/plat-samsung/clock.c
index 3b4451979d1b..10f71179071f 100644
--- a/arch/arm/plat-samsung/clock.c
+++ b/arch/arm/plat-samsung/clock.c
@@ -33,7 +33,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/platform_device.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/clk.h>
diff --git a/arch/arm/plat-samsung/pm-gpio.c b/arch/arm/plat-samsung/pm-gpio.c
index 4be016eaa6db..c2ff92c30bdf 100644
--- a/arch/arm/plat-samsung/pm-gpio.c
+++ b/arch/arm/plat-samsung/pm-gpio.c
@@ -14,7 +14,7 @@
 */
 
 #include <linux/kernel.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/gpio.h>
diff --git a/arch/arm/plat-samsung/wakeup-mask.c b/arch/arm/plat-samsung/wakeup-mask.c
index dc814037297b..20c3d9117cc2 100644
--- a/arch/arm/plat-samsung/wakeup-mask.c
+++ b/arch/arm/plat-samsung/wakeup-mask.c
@@ -11,7 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/spinlock.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/types.h>
 #include <linux/irq.h>
 #include <linux/io.h>
diff --git a/arch/avr32/boards/merisc/merisc_sysfs.c b/arch/avr32/boards/merisc/merisc_sysfs.c
index df431fdba9ad..5a252318f4bd 100644
--- a/arch/avr32/boards/merisc/merisc_sysfs.c
+++ b/arch/avr32/boards/merisc/merisc_sysfs.c
@@ -13,7 +13,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/device.h>
-#include <linux/sysdev.h>
 #include <linux/timer.h>
 #include <linux/err.h>
 #include <linux/ctype.h>
diff --git a/arch/avr32/kernel/irq.c b/arch/avr32/kernel/irq.c
index bc3aa18293df..900e49b2258b 100644
--- a/arch/avr32/kernel/irq.c
+++ b/arch/avr32/kernel/irq.c
@@ -14,7 +14,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 
 /* May be overridden by platform code */
 int __weak nmi_enable(void)
diff --git a/arch/mips/txx9/generic/setup_tx4939.c b/arch/mips/txx9/generic/setup_tx4939.c
index ba3cec3155df..6567895d1f59 100644
--- a/arch/mips/txx9/generic/setup_tx4939.c
+++ b/arch/mips/txx9/generic/setup_tx4939.c
@@ -15,7 +15,7 @@
 #include <linux/delay.h>
 #include <linux/netdevice.h>
 #include <linux/notifier.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/ethtool.h>
 #include <linux/param.h>
 #include <linux/ptrace.h>
diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c
index f5c5c762d5a3..4a255cf8cd17 100644
--- a/arch/powerpc/platforms/cell/smp.c
+++ b/arch/powerpc/platforms/cell/smp.c
@@ -23,7 +23,7 @@
 #include <linux/spinlock.h>
 #include <linux/cache.h>
 #include <linux/err.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 
 #include <asm/ptrace.h>
diff --git a/arch/powerpc/platforms/iseries/smp.c b/arch/powerpc/platforms/iseries/smp.c
index 7e2a5515ed76..02df49fb59f0 100644
--- a/arch/powerpc/platforms/iseries/smp.c
+++ b/arch/powerpc/platforms/iseries/smp.c
@@ -24,7 +24,7 @@
 #include <linux/spinlock.h>
 #include <linux/cache.h>
 #include <linux/err.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 
 #include <asm/ptrace.h>
diff --git a/arch/powerpc/platforms/powermac/cpufreq_32.c b/arch/powerpc/platforms/powermac/cpufreq_32.c
index 04af5f48b4eb..1fc386a23f18 100644
--- a/arch/powerpc/platforms/powermac/cpufreq_32.c
+++ b/arch/powerpc/platforms/powermac/cpufreq_32.c
@@ -23,7 +23,7 @@
 #include <linux/pmu.h>
 #include <linux/cpufreq.h>
 #include <linux/init.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/hardirq.h>
 #include <asm/prom.h>
 #include <asm/machdep.h>
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index 26e93fd4c62b..6212ff4693ea 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -22,7 +22,7 @@
 #include <linux/spinlock.h>
 #include <linux/cache.h>
 #include <linux/err.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/cpu.h>
 
 #include <asm/ptrace.h>
diff --git a/arch/powerpc/sysdev/uic.c b/arch/powerpc/sysdev/uic.c
index 3330feca7502..063c901b1265 100644
--- a/arch/powerpc/sysdev/uic.c
+++ b/arch/powerpc/sysdev/uic.c
@@ -18,7 +18,6 @@
 #include <linux/stddef.h>
 #include <linux/sched.h>
 #include <linux/signal.h>
-#include <linux/sysdev.h>
 #include <linux/device.h>
 #include <linux/bootmem.h>
 #include <linux/spinlock.h>
diff --git a/arch/unicore32/kernel/puv3-core.c b/arch/unicore32/kernel/puv3-core.c
index 1a505a787765..254adeecc61a 100644
--- a/arch/unicore32/kernel/puv3-core.c
+++ b/arch/unicore32/kernel/puv3-core.c
@@ -13,7 +13,6 @@
 
 #include <linux/init.h>
 #include <linux/device.h>
-#include <linux/sysdev.h>
 #include <linux/amba/bus.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
diff --git a/arch/unicore32/kernel/puv3-nb0916.c b/arch/unicore32/kernel/puv3-nb0916.c
index e731c561ed4e..37b12a06b499 100644
--- a/arch/unicore32/kernel/puv3-nb0916.c
+++ b/arch/unicore32/kernel/puv3-nb0916.c
@@ -13,7 +13,6 @@
 
 #include <linux/init.h>
 #include <linux/device.h>
-#include <linux/sysdev.h>
 #include <linux/platform_device.h>
 #include <linux/mtd/physmap.h>
 #include <linux/io.h>
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index b946a9eac7d9..98094a9580f7 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -2,7 +2,6 @@
 #include <linux/clockchips.h>
 #include <linux/interrupt.h>
 #include <linux/export.h>
-#include <linux/sysdev.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/i8253.h>
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index b3300e6bacef..313fb5cddbce 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -9,7 +9,7 @@
 #include <linux/kprobes.h>
 #include <linux/init.h>
 #include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/bitops.h>
 #include <linux/acpi.h>
 #include <linux/io.h>
diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c
index 309c70fb7759..5d4ba301e776 100644
--- a/arch/x86/platform/uv/uv_sysfs.c
+++ b/arch/x86/platform/uv/uv_sysfs.c
@@ -19,7 +19,7 @@
  *  Copyright (c) Russ Anderson
  */
 
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <asm/uv/bios.h>
 #include <asm/uv/uv.h>
 
diff --git a/drivers/gpio/gpio-samsung.c b/drivers/gpio/gpio-samsung.c
index 866251852719..dcbe4541fe49 100644
--- a/drivers/gpio/gpio-samsung.c
+++ b/drivers/gpio/gpio-samsung.c
@@ -22,7 +22,7 @@
 #include <linux/spinlock.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/ioport.h>
 
 #include <asm/irq.h>
diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 661b692573e7..cc068d3a3244 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -15,7 +15,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/device.h>
-#include <linux/sysdev.h>
 #include <linux/timer.h>
 #include <linux/err.h>
 #include <linux/ctype.h>
diff --git a/drivers/leds/led-triggers.c b/drivers/leds/led-triggers.c
index 6f1ff93d7cec..46b4c766335d 100644
--- a/drivers/leds/led-triggers.c
+++ b/drivers/leds/led-triggers.c
@@ -17,7 +17,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/device.h>
-#include <linux/sysdev.h>
 #include <linux/timer.h>
 #include <linux/rwsem.h>
 #include <linux/leds.h>
diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c
index 116a49ce74b2..54ac7ffacb40 100644
--- a/drivers/macintosh/smu.c
+++ b/drivers/macintosh/smu.c
@@ -32,7 +32,6 @@
 #include <linux/completion.h>
 #include <linux/miscdevice.h>
 #include <linux/delay.h>
-#include <linux/sysdev.h>
 #include <linux/poll.h>
 #include <linux/mutex.h>
 #include <linux/of_device.h>
@@ -681,9 +680,6 @@ static struct platform_driver smu_of_platform_driver =
 static int __init smu_init_sysfs(void)
 {
 	/*
-	 * Due to sysfs bogosity, a sysdev is not a real device, so
-	 * we should in fact create both if we want sysdev semantics
-	 * for power management.
 	 * For now, we don't power manage machines with an SMU chip,
 	 * I'm a bit too far from figuring out how that works with those
 	 * new chipsets, but that will come back and bite us
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 5a20804fdece..549742f23ad5 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -26,7 +26,6 @@
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/sched.h>
-#include <linux/sysdev.h>
 #include <linux/fs.h>
 #include <linux/types.h>
 #include <linux/string.h>
diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c
index 48870e504231..f00d0d1e0653 100644
--- a/drivers/platform/x86/intel_scu_ipc.c
+++ b/drivers/platform/x86/intel_scu_ipc.c
@@ -19,7 +19,7 @@
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/init.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/pm.h>
 #include <linux/pci.h>
 #include <linux/interrupt.h>
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index 98f3e4ade924..690c3338a8ae 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -36,7 +36,7 @@
 #include <linux/blkdev.h>
 #include <linux/blkpg.h>
 #include <linux/hdreg.h>  /* HDIO_GETGEO */
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/bio.h>
 #include <linux/suspend.h>
 #include <linux/platform_device.h>
-- 
cgit v1.2.3-55-g7522


From 933393f58fef9963eac61db8093689544e29a600 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Thu, 22 Dec 2011 11:58:51 -0600
Subject: percpu: Remove irqsafe_cpu_xxx variants

We simply say that regular this_cpu use must be safe regardless of
preemption and interrupt state.  That has no material change for x86
and s390 implementations of this_cpu operations.  However, arches that
do not provide their own implementation for this_cpu operations will
now get code generated that disables interrupts instead of preemption.

-tj: This is part of on-going percpu API cleanup.  For detailed
     discussion of the subject, please refer to the following thread.

     http://thread.gmane.org/gmane.linux.kernel/1222078

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
LKML-Reference: <alpine.DEB.2.00.1112221154380.11787@router.home>
---
 arch/s390/include/asm/percpu.h     |  44 ++++-----
 arch/x86/include/asm/percpu.h      |  28 ------
 include/linux/netdevice.h          |   4 +-
 include/linux/netfilter/x_tables.h |   4 +-
 include/linux/percpu.h             | 190 +++++--------------------------------
 include/net/snmp.h                 |  14 +--
 mm/slub.c                          |   6 +-
 net/caif/caif_dev.c                |   4 +-
 net/caif/cffrml.c                  |   4 +-
 9 files changed, 62 insertions(+), 236 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h
index 5325c89a5843..0fbd1899c7b0 100644
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -19,7 +19,7 @@
 #define ARCH_NEEDS_WEAK_PER_CPU
 #endif
 
-#define arch_irqsafe_cpu_to_op(pcp, val, op)				\
+#define arch_this_cpu_to_op(pcp, val, op)				\
 do {									\
 	typedef typeof(pcp) pcp_op_T__;					\
 	pcp_op_T__ old__, new__, prev__;				\
@@ -41,27 +41,27 @@ do {									\
 	preempt_enable();						\
 } while (0)
 
-#define irqsafe_cpu_add_1(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, +)
-#define irqsafe_cpu_add_2(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, +)
-#define irqsafe_cpu_add_4(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, +)
-#define irqsafe_cpu_add_8(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, +)
+#define this_cpu_add_1(pcp, val) arch_this_cpu_to_op(pcp, val, +)
+#define this_cpu_add_2(pcp, val) arch_this_cpu_to_op(pcp, val, +)
+#define this_cpu_add_4(pcp, val) arch_this_cpu_to_op(pcp, val, +)
+#define this_cpu_add_8(pcp, val) arch_this_cpu_to_op(pcp, val, +)
 
-#define irqsafe_cpu_and_1(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, &)
-#define irqsafe_cpu_and_2(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, &)
-#define irqsafe_cpu_and_4(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, &)
-#define irqsafe_cpu_and_8(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, &)
+#define this_cpu_and_1(pcp, val) arch_this_cpu_to_op(pcp, val, &)
+#define this_cpu_and_2(pcp, val) arch_this_cpu_to_op(pcp, val, &)
+#define this_cpu_and_4(pcp, val) arch_this_cpu_to_op(pcp, val, &)
+#define this_cpu_and_8(pcp, val) arch_this_cpu_to_op(pcp, val, &)
 
-#define irqsafe_cpu_or_1(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, |)
-#define irqsafe_cpu_or_2(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, |)
-#define irqsafe_cpu_or_4(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, |)
-#define irqsafe_cpu_or_8(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, |)
+#define this_cpu_or_1(pcp, val) arch_this_cpu_to_op(pcp, val, |)
+#define this_cpu_or_2(pcp, val) arch_this_cpu_to_op(pcp, val, |)
+#define this_cpu_or_4(pcp, val) arch_this_cpu_to_op(pcp, val, |)
+#define this_cpu_or_8(pcp, val) arch_this_cpu_to_op(pcp, val, |)
 
-#define irqsafe_cpu_xor_1(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, ^)
-#define irqsafe_cpu_xor_2(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, ^)
-#define irqsafe_cpu_xor_4(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, ^)
-#define irqsafe_cpu_xor_8(pcp, val) arch_irqsafe_cpu_to_op(pcp, val, ^)
+#define this_cpu_xor_1(pcp, val) arch_this_cpu_to_op(pcp, val, ^)
+#define this_cpu_xor_2(pcp, val) arch_this_cpu_to_op(pcp, val, ^)
+#define this_cpu_xor_4(pcp, val) arch_this_cpu_to_op(pcp, val, ^)
+#define this_cpu_xor_8(pcp, val) arch_this_cpu_to_op(pcp, val, ^)
 
-#define arch_irqsafe_cpu_cmpxchg(pcp, oval, nval)			\
+#define arch_this_cpu_cmpxchg(pcp, oval, nval)			\
 ({									\
 	typedef typeof(pcp) pcp_op_T__;					\
 	pcp_op_T__ ret__;						\
@@ -79,10 +79,10 @@ do {									\
 	ret__;								\
 })
 
-#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) arch_irqsafe_cpu_cmpxchg(pcp, oval, nval)
-#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) arch_irqsafe_cpu_cmpxchg(pcp, oval, nval)
-#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) arch_irqsafe_cpu_cmpxchg(pcp, oval, nval)
-#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) arch_irqsafe_cpu_cmpxchg(pcp, oval, nval)
+#define this_cpu_cmpxchg_1(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)
 
 #include <asm-generic/percpu.h>
 
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 3470c9d0ebba..562ccb5323de 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -414,22 +414,6 @@ do {									\
 #define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
 #define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
 
-#define irqsafe_cpu_add_1(pcp, val)	percpu_add_op((pcp), val)
-#define irqsafe_cpu_add_2(pcp, val)	percpu_add_op((pcp), val)
-#define irqsafe_cpu_add_4(pcp, val)	percpu_add_op((pcp), val)
-#define irqsafe_cpu_and_1(pcp, val)	percpu_to_op("and", (pcp), val)
-#define irqsafe_cpu_and_2(pcp, val)	percpu_to_op("and", (pcp), val)
-#define irqsafe_cpu_and_4(pcp, val)	percpu_to_op("and", (pcp), val)
-#define irqsafe_cpu_or_1(pcp, val)	percpu_to_op("or", (pcp), val)
-#define irqsafe_cpu_or_2(pcp, val)	percpu_to_op("or", (pcp), val)
-#define irqsafe_cpu_or_4(pcp, val)	percpu_to_op("or", (pcp), val)
-#define irqsafe_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
-
 #ifndef CONFIG_M386
 #define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
 #define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
@@ -445,9 +429,6 @@ do {									\
 #define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 #define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
-#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
-#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
-#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 #endif /* !CONFIG_M386 */
 
 #ifdef CONFIG_X86_CMPXCHG64
@@ -467,7 +448,6 @@ do {									\
 
 #define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
 #define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
-#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
 #endif /* CONFIG_X86_CMPXCHG64 */
 
 /*
@@ -495,13 +475,6 @@ do {									\
 #define this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
-#define irqsafe_cpu_add_8(pcp, val)	percpu_add_op((pcp), val)
-#define irqsafe_cpu_and_8(pcp, val)	percpu_to_op("and", (pcp), val)
-#define irqsafe_cpu_or_8(pcp, val)	percpu_to_op("or", (pcp), val)
-#define irqsafe_cpu_xor_8(pcp, val)	percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
-
 /*
  * Pretty complex macro to generate cmpxchg16 instruction.  The instruction
  * is not supported on early AMD64 processors so we must be able to emulate
@@ -532,7 +505,6 @@ do {									\
 
 #define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
 #define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
-#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
 
 #endif
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a82ad4dd306a..ca8d9bc4e502 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2115,7 +2115,7 @@ extern void netdev_run_todo(void);
  */
 static inline void dev_put(struct net_device *dev)
 {
-	irqsafe_cpu_dec(*dev->pcpu_refcnt);
+	this_cpu_dec(*dev->pcpu_refcnt);
 }
 
 /**
@@ -2126,7 +2126,7 @@ static inline void dev_put(struct net_device *dev)
  */
 static inline void dev_hold(struct net_device *dev)
 {
-	irqsafe_cpu_inc(*dev->pcpu_refcnt);
+	this_cpu_inc(*dev->pcpu_refcnt);
 }
 
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 32cddf78b13e..8d674a786744 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -471,7 +471,7 @@ DECLARE_PER_CPU(seqcount_t, xt_recseq);
  *
  * Begin packet processing : all readers must wait the end
  * 1) Must be called with preemption disabled
- * 2) softirqs must be disabled too (or we should use irqsafe_cpu_add())
+ * 2) softirqs must be disabled too (or we should use this_cpu_add())
  * Returns :
  *  1 if no recursion on this cpu
  *  0 if recursion detected
@@ -503,7 +503,7 @@ static inline unsigned int xt_write_recseq_begin(void)
  *
  * End packet processing : all readers can proceed
  * 1) Must be called with preemption disabled
- * 2) softirqs must be disabled too (or we should use irqsafe_cpu_add())
+ * 2) softirqs must be disabled too (or we should use this_cpu_add())
  */
 static inline void xt_write_recseq_end(unsigned int addend)
 {
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 9ca008f0c542..32cd1f67462e 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -172,10 +172,10 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
  * equal char, int or long.  percpu_read() evaluates to a lvalue and
  * all others to void.
  *
- * These operations are guaranteed to be atomic w.r.t. preemption.
- * The generic versions use plain get/put_cpu_var().  Archs are
+ * These operations are guaranteed to be atomic.
+ * The generic versions disable interrupts.  Archs are
  * encouraged to implement single-instruction alternatives which don't
- * require preemption protection.
+ * require protection.
  */
 #ifndef percpu_read
 # define percpu_read(var)						\
@@ -347,9 +347,10 @@ do {									\
 
 #define _this_cpu_generic_to_op(pcp, val, op)				\
 do {									\
-	preempt_disable();						\
+	unsigned long flags;						\
+	local_irq_save(flags);						\
 	*__this_cpu_ptr(&(pcp)) op val;					\
-	preempt_enable();						\
+	local_irq_restore(flags);					\
 } while (0)
 
 #ifndef this_cpu_write
@@ -447,10 +448,11 @@ do {									\
 #define _this_cpu_generic_add_return(pcp, val)				\
 ({									\
 	typeof(pcp) ret__;						\
-	preempt_disable();						\
+	unsigned long flags;						\
+	local_irq_save(flags);						\
 	__this_cpu_add(pcp, val);					\
 	ret__ = __this_cpu_read(pcp);					\
-	preempt_enable();						\
+	local_irq_restore(flags);					\
 	ret__;								\
 })
 
@@ -476,10 +478,11 @@ do {									\
 
 #define _this_cpu_generic_xchg(pcp, nval)				\
 ({	typeof(pcp) ret__;						\
-	preempt_disable();						\
+	unsigned long flags;						\
+	local_irq_save(flags);						\
 	ret__ = __this_cpu_read(pcp);					\
 	__this_cpu_write(pcp, nval);					\
-	preempt_enable();						\
+	local_irq_restore(flags);					\
 	ret__;								\
 })
 
@@ -501,12 +504,14 @@ do {									\
 #endif
 
 #define _this_cpu_generic_cmpxchg(pcp, oval, nval)			\
-({	typeof(pcp) ret__;						\
-	preempt_disable();						\
+({									\
+	typeof(pcp) ret__;						\
+	unsigned long flags;						\
+	local_irq_save(flags);						\
 	ret__ = __this_cpu_read(pcp);					\
 	if (ret__ == (oval))						\
 		__this_cpu_write(pcp, nval);				\
-	preempt_enable();						\
+	local_irq_restore(flags);					\
 	ret__;								\
 })
 
@@ -538,10 +543,11 @@ do {									\
 #define _this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
 ({									\
 	int ret__;							\
-	preempt_disable();						\
+	unsigned long flags;						\
+	local_irq_save(flags);						\
 	ret__ = __this_cpu_generic_cmpxchg_double(pcp1, pcp2,		\
 			oval1, oval2, nval1, nval2);			\
-	preempt_enable();						\
+	local_irq_restore(flags);					\
 	ret__;								\
 })
 
@@ -567,9 +573,9 @@ do {									\
 #endif
 
 /*
- * Generic percpu operations that do not require preemption handling.
+ * Generic percpu operations for context that are safe from preemption/interrupts.
  * Either we do not care about races or the caller has the
- * responsibility of handling preemptions issues. Arch code can still
+ * responsibility of handling preemption/interrupt issues. Arch code can still
  * override these instructions since the arch per cpu code may be more
  * efficient and may actually get race freeness for free (that is the
  * case for x86 for example).
@@ -802,156 +808,4 @@ do {									\
 	__pcpu_double_call_return_bool(__this_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2))
 #endif
 
-/*
- * IRQ safe versions of the per cpu RMW operations. Note that these operations
- * are *not* safe against modification of the same variable from another
- * processors (which one gets when using regular atomic operations)
- * They are guaranteed to be atomic vs. local interrupts and
- * preemption only.
- */
-#define irqsafe_cpu_generic_to_op(pcp, val, op)				\
-do {									\
-	unsigned long flags;						\
-	local_irq_save(flags);						\
-	*__this_cpu_ptr(&(pcp)) op val;					\
-	local_irq_restore(flags);					\
-} while (0)
-
-#ifndef irqsafe_cpu_add
-# ifndef irqsafe_cpu_add_1
-#  define irqsafe_cpu_add_1(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), +=)
-# endif
-# ifndef irqsafe_cpu_add_2
-#  define irqsafe_cpu_add_2(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), +=)
-# endif
-# ifndef irqsafe_cpu_add_4
-#  define irqsafe_cpu_add_4(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), +=)
-# endif
-# ifndef irqsafe_cpu_add_8
-#  define irqsafe_cpu_add_8(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), +=)
-# endif
-# define irqsafe_cpu_add(pcp, val) __pcpu_size_call(irqsafe_cpu_add_, (pcp), (val))
-#endif
-
-#ifndef irqsafe_cpu_sub
-# define irqsafe_cpu_sub(pcp, val)	irqsafe_cpu_add((pcp), -(val))
-#endif
-
-#ifndef irqsafe_cpu_inc
-# define irqsafe_cpu_inc(pcp)	irqsafe_cpu_add((pcp), 1)
-#endif
-
-#ifndef irqsafe_cpu_dec
-# define irqsafe_cpu_dec(pcp)	irqsafe_cpu_sub((pcp), 1)
-#endif
-
-#ifndef irqsafe_cpu_and
-# ifndef irqsafe_cpu_and_1
-#  define irqsafe_cpu_and_1(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), &=)
-# endif
-# ifndef irqsafe_cpu_and_2
-#  define irqsafe_cpu_and_2(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), &=)
-# endif
-# ifndef irqsafe_cpu_and_4
-#  define irqsafe_cpu_and_4(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), &=)
-# endif
-# ifndef irqsafe_cpu_and_8
-#  define irqsafe_cpu_and_8(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), &=)
-# endif
-# define irqsafe_cpu_and(pcp, val) __pcpu_size_call(irqsafe_cpu_and_, (val))
-#endif
-
-#ifndef irqsafe_cpu_or
-# ifndef irqsafe_cpu_or_1
-#  define irqsafe_cpu_or_1(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), |=)
-# endif
-# ifndef irqsafe_cpu_or_2
-#  define irqsafe_cpu_or_2(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), |=)
-# endif
-# ifndef irqsafe_cpu_or_4
-#  define irqsafe_cpu_or_4(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), |=)
-# endif
-# ifndef irqsafe_cpu_or_8
-#  define irqsafe_cpu_or_8(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), |=)
-# endif
-# define irqsafe_cpu_or(pcp, val) __pcpu_size_call(irqsafe_cpu_or_, (val))
-#endif
-
-#ifndef irqsafe_cpu_xor
-# ifndef irqsafe_cpu_xor_1
-#  define irqsafe_cpu_xor_1(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), ^=)
-# endif
-# ifndef irqsafe_cpu_xor_2
-#  define irqsafe_cpu_xor_2(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), ^=)
-# endif
-# ifndef irqsafe_cpu_xor_4
-#  define irqsafe_cpu_xor_4(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), ^=)
-# endif
-# ifndef irqsafe_cpu_xor_8
-#  define irqsafe_cpu_xor_8(pcp, val) irqsafe_cpu_generic_to_op((pcp), (val), ^=)
-# endif
-# define irqsafe_cpu_xor(pcp, val) __pcpu_size_call(irqsafe_cpu_xor_, (val))
-#endif
-
-#define irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)			\
-({									\
-	typeof(pcp) ret__;						\
-	unsigned long flags;						\
-	local_irq_save(flags);						\
-	ret__ = __this_cpu_read(pcp);					\
-	if (ret__ == (oval))						\
-		__this_cpu_write(pcp, nval);				\
-	local_irq_restore(flags);					\
-	ret__;								\
-})
-
-#ifndef irqsafe_cpu_cmpxchg
-# ifndef irqsafe_cpu_cmpxchg_1
-#  define irqsafe_cpu_cmpxchg_1(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
-# endif
-# ifndef irqsafe_cpu_cmpxchg_2
-#  define irqsafe_cpu_cmpxchg_2(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
-# endif
-# ifndef irqsafe_cpu_cmpxchg_4
-#  define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
-# endif
-# ifndef irqsafe_cpu_cmpxchg_8
-#  define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
-# endif
-# define irqsafe_cpu_cmpxchg(pcp, oval, nval)		\
-	__pcpu_size_call_return2(irqsafe_cpu_cmpxchg_, (pcp), oval, nval)
-#endif
-
-#define irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
-({									\
-	int ret__;							\
-	unsigned long flags;						\
-	local_irq_save(flags);						\
-	ret__ = __this_cpu_generic_cmpxchg_double(pcp1, pcp2,		\
-			oval1, oval2, nval1, nval2);			\
-	local_irq_restore(flags);					\
-	ret__;								\
-})
-
-#ifndef irqsafe_cpu_cmpxchg_double
-# ifndef irqsafe_cpu_cmpxchg_double_1
-#  define irqsafe_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
-	irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
-# endif
-# ifndef irqsafe_cpu_cmpxchg_double_2
-#  define irqsafe_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
-	irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
-# endif
-# ifndef irqsafe_cpu_cmpxchg_double_4
-#  define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
-	irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
-# endif
-# ifndef irqsafe_cpu_cmpxchg_double_8
-#  define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
-	irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
-# endif
-# define irqsafe_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
-	__pcpu_double_call_return_bool(irqsafe_cpu_cmpxchg_double_, (pcp1), (pcp2), (oval1), (oval2), (nval1), (nval2))
-#endif
-
 #endif /* __LINUX_PERCPU_H */
diff --git a/include/net/snmp.h b/include/net/snmp.h
index 8f0f9ac0307f..e067aed7e378 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -129,33 +129,33 @@ struct linux_xfrm_mib {
 			__this_cpu_inc(mib[0]->mibs[field])
 
 #define SNMP_INC_STATS_USER(mib, field)	\
-			irqsafe_cpu_inc(mib[0]->mibs[field])
+			this_cpu_inc(mib[0]->mibs[field])
 
 #define SNMP_INC_STATS_ATOMIC_LONG(mib, field)	\
 			atomic_long_inc(&mib->mibs[field])
 
 #define SNMP_INC_STATS(mib, field)	\
-			irqsafe_cpu_inc(mib[0]->mibs[field])
+			this_cpu_inc(mib[0]->mibs[field])
 
 #define SNMP_DEC_STATS(mib, field)	\
-			irqsafe_cpu_dec(mib[0]->mibs[field])
+			this_cpu_dec(mib[0]->mibs[field])
 
 #define SNMP_ADD_STATS_BH(mib, field, addend)	\
 			__this_cpu_add(mib[0]->mibs[field], addend)
 
 #define SNMP_ADD_STATS_USER(mib, field, addend)	\
-			irqsafe_cpu_add(mib[0]->mibs[field], addend)
+			this_cpu_add(mib[0]->mibs[field], addend)
 
 #define SNMP_ADD_STATS(mib, field, addend)	\
-			irqsafe_cpu_add(mib[0]->mibs[field], addend)
+			this_cpu_add(mib[0]->mibs[field], addend)
 /*
  * Use "__typeof__(*mib[0]) *ptr" instead of "__typeof__(mib[0]) ptr"
  * to make @ptr a non-percpu pointer.
  */
 #define SNMP_UPD_PO_STATS(mib, basefield, addend)	\
 	do { \
-		irqsafe_cpu_inc(mib[0]->mibs[basefield##PKTS]);		\
-		irqsafe_cpu_add(mib[0]->mibs[basefield##OCTETS], addend);	\
+		this_cpu_inc(mib[0]->mibs[basefield##PKTS]);		\
+		this_cpu_add(mib[0]->mibs[basefield##OCTETS], addend);	\
 	} while (0)
 #define SNMP_UPD_PO_STATS_BH(mib, basefield, addend)	\
 	do { \
diff --git a/mm/slub.c b/mm/slub.c
index ed3334d9b6da..0011489c28ac 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1978,7 +1978,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 		page->pobjects = pobjects;
 		page->next = oldpage;
 
-	} while (irqsafe_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
+	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
 	stat(s, CPU_PARTIAL_FREE);
 	return pobjects;
 }
@@ -2304,7 +2304,7 @@ redo:
 		 * Since this is without lock semantics the protection is only against
 		 * code executing on this cpu *not* from access by other cpus.
 		 */
-		if (unlikely(!irqsafe_cpu_cmpxchg_double(
+		if (unlikely(!this_cpu_cmpxchg_double(
 				s->cpu_slab->freelist, s->cpu_slab->tid,
 				object, tid,
 				get_freepointer_safe(s, object), next_tid(tid)))) {
@@ -2534,7 +2534,7 @@ redo:
 	if (likely(page == c->page)) {
 		set_freepointer(s, object, c->freelist);
 
-		if (unlikely(!irqsafe_cpu_cmpxchg_double(
+		if (unlikely(!this_cpu_cmpxchg_double(
 				s->cpu_slab->freelist, s->cpu_slab->tid,
 				c->freelist, tid,
 				object, next_tid(tid)))) {
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index f1fa1f6e658d..64930cc2746a 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -69,12 +69,12 @@ static struct caif_device_entry_list *caif_device_list(struct net *net)
 
 static void caifd_put(struct caif_device_entry *e)
 {
-	irqsafe_cpu_dec(*e->pcpu_refcnt);
+	this_cpu_dec(*e->pcpu_refcnt);
 }
 
 static void caifd_hold(struct caif_device_entry *e)
 {
-	irqsafe_cpu_inc(*e->pcpu_refcnt);
+	this_cpu_inc(*e->pcpu_refcnt);
 }
 
 static int caifd_refcnt_read(struct caif_device_entry *e)
diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c
index d3ca87bf23b7..0a7df7ef062d 100644
--- a/net/caif/cffrml.c
+++ b/net/caif/cffrml.c
@@ -177,14 +177,14 @@ void cffrml_put(struct cflayer *layr)
 {
 	struct cffrml *this = container_obj(layr);
 	if (layr != NULL && this->pcpu_refcnt != NULL)
-		irqsafe_cpu_dec(*this->pcpu_refcnt);
+		this_cpu_dec(*this->pcpu_refcnt);
 }
 
 void cffrml_hold(struct cflayer *layr)
 {
 	struct cffrml *this = container_obj(layr);
 	if (layr != NULL && this->pcpu_refcnt != NULL)
-		irqsafe_cpu_inc(*this->pcpu_refcnt);
+		this_cpu_inc(*this->pcpu_refcnt);
 }
 
 int cffrml_refcnt_read(struct cflayer *layr)
-- 
cgit v1.2.3-55-g7522


From d6185f20a0efbf175e12831d0de330e4f21725aa Mon Sep 17 00:00:00 2001
From: Nadav Har'El
Date: Thu, 22 Sep 2011 13:52:56 +0300
Subject: KVM: nVMX: Add KVM_REQ_IMMEDIATE_EXIT

This patch adds a new vcpu->requests bit, KVM_REQ_IMMEDIATE_EXIT.
This bit requests that when next entering the guest, we should run it only
for as little as possible, and exit again.

We use this new option in nested VMX: When L1 launches L2, but L0 wishes L1
to continue running so it can inject an event to it, we unfortunately cannot
just pretend to have run L2 for a little while - We must really launch L2,
otherwise certain one-off vmcs12 parameters (namely, L1 injection into L2)
will be lost. So the existing code runs L2 in this case.
But L2 could potentially run for a long time until it exits, and the
injection into L1 will be delayed. The new KVM_REQ_IMMEDIATE_EXIT allows us
to request that L2 will be entered, as necessary, but will exit as soon as
possible after entry.

Our implementation of this request uses smp_send_reschedule() to send a
self-IPI, with interrupts disabled. The interrupts remain disabled until the
guest is entered, and then, after the entry is complete (often including
processing an injection and jumping to the relevant handler), the physical
interrupt is noticed and causes an exit.

On recent Intel processors, we could have achieved the same goal by using
MTF instead of a self-IPI. Another technique worth considering in the future
is to use VM_EXIT_ACK_INTR_ON_EXIT and a highest-priority vector IPI - to
slightly improve performance by avoiding the useless interrupt handler
which ends up being called when smp_send_reschedule() is used.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c       | 11 +++++++----
 arch/x86/kvm/x86.c       |  7 ++++++-
 include/linux/kvm_host.h |  1 +
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 579a0b51696a..d75d91465246 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3945,12 +3945,15 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
-	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
-		/* We can get here when nested_run_pending caused
-		 * vmx_interrupt_allowed() to return false. In this case, do
-		 * nothing - the interrupt will be injected later.
+	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+		/*
+		 * We get here if vmx_interrupt_allowed() said we can't
+		 * inject to L1 now because L2 must run. Ask L2 to exit
+		 * right after entry, so we can inject to L1 more promptly.
 		 */
+		kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
 		return;
+	}
 
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4c938da2ba00..e24edbc7f2ec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5648,6 +5648,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	int r;
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
 		vcpu->run->request_interrupt_window;
+	bool req_immediate_exit = 0;
 
 	if (vcpu->requests) {
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -5687,7 +5688,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			record_steal_time(vcpu);
 		if (kvm_check_request(KVM_REQ_NMI, vcpu))
 			process_nmi(vcpu);
-
+		req_immediate_exit =
+			kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
 	}
 
 	r = kvm_mmu_reload(vcpu);
@@ -5738,6 +5740,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
+	if (req_immediate_exit)
+		smp_send_reschedule(vcpu->cpu);
+
 	kvm_guest_enter();
 
 	if (unlikely(vcpu->arch.switch_db_regs)) {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d52623199978..9fedeb356b95 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -50,6 +50,7 @@
 #define KVM_REQ_APF_HALT          12
 #define KVM_REQ_STEAL_UPDATE      13
 #define KVM_REQ_NMI               14
+#define KVM_REQ_IMMEDIATE_EXIT    15
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
-- 
cgit v1.2.3-55-g7522


From 51cfe38ea50aa631f58ed8c340ed6f0143c325a8 Mon Sep 17 00:00:00 2001
From: Nadav Har'El
Date: Thu, 22 Sep 2011 13:53:26 +0300
Subject: KVM: nVMX: Fix warning-causing idt-vectoring-info behavior

When L0 wishes to inject an interrupt while L2 is running, it emulates an exit
to L1 with EXIT_REASON_EXTERNAL_INTERRUPT. This was explained in the original
nVMX patch 23, titled "Correct handling of interrupt injection".

Unfortunately, it is possible (though rare) that at this point there is valid
idt_vectoring_info in vmcs02. For example, L1 injected some interrupt to L2,
and when L2 tried to run this interrupt's handler, it got a page fault - so
it returns the original interrupt vector in idt_vectoring_info. The problem
is that if this is the case, we cannot exit to L1 with EXTERNAL_INTERRUPT
like we wished to, because the VMX spec guarantees that idt_vectoring_info
and exit_reason_external_interrupt can never happen together. This is not
just specified in the spec - a KVM L1 actually prints a kernel warning
"unexpected, valid vectoring info" if we violate this guarantee, and some
users noticed these warnings in L1's logs.

In order to better emulate a processor, which would never return the external
interrupt and the idt-vectoring-info together, we need to separate the two
injection steps: First, complete L1's injection into L2 (i.e., enter L2,
injecting to it the idt-vectoring-info); Second, after entry into L2 succeeds
and it exits back to L0, exit to L1 with the EXIT_REASON_EXTERNAL_INTERRUPT.
Most of this is already in the code - the only change we need is to remain
in L2 (and not exit to L1) in this case.

Note that the previous patch ensures (by using KVM_REQ_IMMEDIATE_EXIT) that
although we do enter L2 first, it will exit immediately after processing its
injection, allowing us to promptly inject to L1.

Note how we test vmcs12->idt_vectoring_info_field; This isn't really the
vmcs12 value (we haven't exited to L1 yet, so vmcs12 hasn't been updated),
but rather the place we save, at the end of vmx_vcpu_run, the vmcs02 value
of this field. This was explained in patch 25 ("Correct handling of idt
vectoring info") of the original nVMX patch series.

Thanks to Dave Allan and to Federico Simoncelli for reporting this bug,
to Abel Gordon for helping me figure out the solution, and to Avi Kivity
for helping to improve it.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d75d91465246..6e28d582a514 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4080,11 +4080,12 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
 	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
-		struct vmcs12 *vmcs12;
-		if (to_vmx(vcpu)->nested.nested_run_pending)
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		if (to_vmx(vcpu)->nested.nested_run_pending ||
+		    (vmcs12->idt_vectoring_info_field &
+		     VECTORING_INFO_VALID_MASK))
 			return 0;
 		nested_vmx_vmexit(vcpu);
-		vmcs12 = get_vmcs12(vcpu);
 		vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
 		vmcs12->vm_exit_intr_info = 0;
 		/* fall through to normal code, but now in L1, not L2 */
-- 
cgit v1.2.3-55-g7522


From f759e2b4c728cee82e4bc1132d0e41177b79a0b1 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 22 Sep 2011 16:53:17 +0800
Subject: KVM: MMU: avoid pte_list_desc running out in kvm_mmu_pte_write

kvm_mmu_pte_write is unsafe since we need to alloc pte_list_desc in the
function when spte is prefetched, unfortunately, we can not know how many
spte need to be prefetched on this path, that means we can use out of the
free  pte_list_desc object in the cache, and BUG_ON() is triggered, also some
path does not fill the cache, such as INS instruction emulated that does not
trigger page fault

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f1b36cf3e3d0..232c5a30ddc8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -593,6 +593,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 	return 0;
 }
 
+static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
+{
+	return cache->nobjs;
+}
+
 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
 				  struct kmem_cache *cache)
 {
@@ -970,6 +975,14 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 	return &linfo->rmap_pde;
 }
 
+static bool rmap_can_add(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_memory_cache *cache;
+
+	cache = &vcpu->arch.mmu_pte_list_desc_cache;
+	return mmu_memory_cache_free_objects(cache);
+}
+
 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 {
 	struct kvm_mmu_page *sp;
@@ -3586,6 +3599,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		break;
 	}
 
+	/*
+	 * No need to care whether allocation memory is successful
+	 * or not since pte prefetch is skiped if it does not have
+	 * enough objects in the cache.
+	 */
+	mmu_topup_memory_caches(vcpu);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
 		gentry = 0;
@@ -3656,7 +3675,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			mmu_page_zap_pte(vcpu->kvm, sp, spte);
 			if (gentry &&
 			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
-			      & mask.word))
+			      & mask.word) && rmap_can_add(vcpu))
 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
 			if (!remote_flush && need_remote_flush(entry, *spte))
 				remote_flush = true;
@@ -3717,10 +3736,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 		goto out;
 	}
 
-	r = mmu_topup_memory_caches(vcpu);
-	if (r)
-		goto out;
-
 	er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
 
 	switch (er) {
-- 
cgit v1.2.3-55-g7522


From d5ae7ce835cc89556dc18e2070e754f026402efa Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 22 Sep 2011 16:53:46 +0800
Subject: KVM: x86: tag the instructions which are used to write page table

The idea is from Avi:
| tag instructions that are typically used to modify the page tables, and
| drop shadow if any other instruction is used.
| The list would include, I'd guess, and, or, bts, btc, mov, xchg, cmpxchg,
| and cmpxchg8b.

This patch is used to tag the instructions and in the later path, shadow page
is dropped if it is written by other instructions

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f1e3be18a08f..a10950a37928 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -125,8 +125,9 @@
 #define Lock        (1<<26) /* lock prefix is allowed for the instruction */
 #define Priv        (1<<27) /* instruction generates #GP if current CPL != 0 */
 #define No64	    (1<<28)
+#define PageTable   (1 << 29)   /* instruction used to write page table */
 /* Source 2 operand type */
-#define Src2Shift   (29)
+#define Src2Shift   (30)
 #define Src2None    (OpNone << Src2Shift)
 #define Src2CL      (OpCL << Src2Shift)
 #define Src2ImmByte (OpImmByte << Src2Shift)
@@ -3033,10 +3034,10 @@ static struct opcode group7_rm7[] = {
 
 static struct opcode group1[] = {
 	I(Lock, em_add),
-	I(Lock, em_or),
+	I(Lock | PageTable, em_or),
 	I(Lock, em_adc),
 	I(Lock, em_sbb),
-	I(Lock, em_and),
+	I(Lock | PageTable, em_and),
 	I(Lock, em_sub),
 	I(Lock, em_xor),
 	I(0, em_cmp),
@@ -3096,18 +3097,21 @@ static struct group_dual group7 = { {
 
 static struct opcode group8[] = {
 	N, N, N, N,
-	D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
-	D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
+	D(DstMem | SrcImmByte | ModRM),
+	D(DstMem | SrcImmByte | ModRM | Lock | PageTable),
+	D(DstMem | SrcImmByte | ModRM | Lock),
+	D(DstMem | SrcImmByte | ModRM | Lock | PageTable),
 };
 
 static struct group_dual group9 = { {
-	N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
+	N, D(DstMem64 | ModRM | Lock | PageTable), N, N, N, N, N, N,
 }, {
 	N, N, N, N, N, N, N, N,
 } };
 
 static struct opcode group11[] = {
-	I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
+	I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov),
+	X7(D(Undefined)),
 };
 
 static struct gprefix pfx_0f_6f_0f_7f = {
@@ -3120,7 +3124,7 @@ static struct opcode opcode_table[256] = {
 	I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
 	I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
 	/* 0x08 - 0x0F */
-	I6ALU(Lock, em_or),
+	I6ALU(Lock | PageTable, em_or),
 	I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
 	N,
 	/* 0x10 - 0x17 */
@@ -3132,7 +3136,7 @@ static struct opcode opcode_table[256] = {
 	I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
 	I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
 	/* 0x20 - 0x27 */
-	I6ALU(Lock, em_and), N, N,
+	I6ALU(Lock | PageTable, em_and), N, N,
 	/* 0x28 - 0x2F */
 	I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
 	/* 0x30 - 0x37 */
@@ -3165,11 +3169,11 @@ static struct opcode opcode_table[256] = {
 	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
 	G(DstMem | SrcImmByte | ModRM | Group, group1),
 	I2bv(DstMem | SrcReg | ModRM, em_test),
-	I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg),
+	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
 	/* 0x88 - 0x8F */
-	I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
+	I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
 	I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
-	I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg),
+	I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg),
 	D(ModRM | SrcMem | NoAccess | DstReg),
 	I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
 	G(0, group1A),
@@ -3182,7 +3186,7 @@ static struct opcode opcode_table[256] = {
 	II(ImplicitOps | Stack, em_popf, popf), N, N,
 	/* 0xA0 - 0xA7 */
 	I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
-	I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
+	I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
 	I2bv(SrcSI | DstDI | Mov | String, em_mov),
 	I2bv(SrcSI | DstDI | String, em_cmp),
 	/* 0xA8 - 0xAF */
@@ -3280,12 +3284,13 @@ static struct opcode twobyte_table[256] = {
 	D(DstMem | SrcReg | Src2CL | ModRM), N, N,
 	/* 0xA8 - 0xAF */
 	I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
-	DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	DI(ImplicitOps, rsm),
+	D(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable),
 	D(DstMem | SrcReg | Src2ImmByte | ModRM),
 	D(DstMem | SrcReg | Src2CL | ModRM),
 	D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
 	/* 0xB0 - 0xB7 */
-	D2bv(DstMem | SrcReg | ModRM | Lock),
+	D2bv(DstMem | SrcReg | ModRM | Lock | PageTable),
 	I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
 	D(DstMem | SrcReg | ModRM | BitOp | Lock),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
@@ -3293,7 +3298,7 @@ static struct opcode twobyte_table[256] = {
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xB8 - 0xBF */
 	N, N,
-	G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable),
 	D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
-- 
cgit v1.2.3-55-g7522


From 1cb3f3ae5a3855ba430430706da4201ace1d6ec4 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 22 Sep 2011 17:02:48 +0800
Subject: KVM: x86: retry non-page-table writing instructions

If the emulation is caused by #PF and it is non-page_table writing instruction,
it means the VM-EXIT is caused by shadow page protected, we can zap the shadow
page and retry this instruction directly

The idea is from Avi

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  5 ++++
 arch/x86/kvm/emulate.c             |  5 ++++
 arch/x86/kvm/mmu.c                 | 25 +++++++++++++++-----
 arch/x86/kvm/x86.c                 | 47 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 77 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index a026507893e9..9a4acf41709c 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -364,6 +364,7 @@ enum x86_intercept {
 #endif
 
 int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
 #define EMULATION_FAILED -1
 #define EMULATION_OK 0
 #define EMULATION_RESTART 1
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b4973f4dab98..4ceefa9567ed 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -444,6 +444,9 @@ struct kvm_vcpu_arch {
 
 	cpumask_var_t wbinvd_dirty_mask;
 
+	unsigned long last_retry_eip;
+	unsigned long last_retry_addr;
+
 	struct {
 		bool halted;
 		gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
@@ -692,6 +695,7 @@ enum emulation_result {
 #define EMULTYPE_NO_DECODE	    (1 << 0)
 #define EMULTYPE_TRAP_UD	    (1 << 1)
 #define EMULTYPE_SKIP		    (1 << 2)
+#define EMULTYPE_RETRY		    (1 << 3)
 int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
 			    int emulation_type, void *insn, int insn_len);
 
@@ -756,6 +760,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes,
 		       bool guest_initiated);
+int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a10950a37928..8547958e3582 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3702,6 +3702,11 @@ done:
 	return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
 }
 
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt)
+{
+	return ctxt->d & PageTable;
+}
+
 static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 {
 	/* The second termination condition only applies for REPE
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 232c5a30ddc8..7a22eb81b4ca 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1998,7 +1998,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
 }
 
-static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node;
@@ -2007,7 +2007,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 
 	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
 	r = 0;
-
+	spin_lock(&kvm->mmu_lock);
 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
 		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
 			 sp->role.word);
@@ -2015,8 +2015,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 	}
 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	spin_unlock(&kvm->mmu_lock);
+
 	return r;
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 {
@@ -3698,9 +3701,8 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 
 	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
 
-	spin_lock(&vcpu->kvm->mmu_lock);
 	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-	spin_unlock(&vcpu->kvm->mmu_lock);
+
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
@@ -3721,10 +3723,18 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
+static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
+{
+	if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
+		return vcpu_match_mmio_gpa(vcpu, addr);
+
+	return vcpu_match_mmio_gva(vcpu, addr);
+}
+
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 		       void *insn, int insn_len)
 {
-	int r;
+	int r, emulation_type = EMULTYPE_RETRY;
 	enum emulation_result er;
 
 	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
@@ -3736,7 +3746,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 		goto out;
 	}
 
-	er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
+	if (is_mmio_page_fault(vcpu, cr2))
+		emulation_type = 0;
+
+	er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
 
 	switch (er) {
 	case EMULATE_DONE:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e24edbc7f2ec..7ba1ab73fd03 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4836,6 +4836,50 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 	return false;
 }
 
+static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
+			      unsigned long cr2,  int emulation_type)
+{
+	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+	unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
+
+	last_retry_eip = vcpu->arch.last_retry_eip;
+	last_retry_addr = vcpu->arch.last_retry_addr;
+
+	/*
+	 * If the emulation is caused by #PF and it is non-page_table
+	 * writing instruction, it means the VM-EXIT is caused by shadow
+	 * page protected, we can zap the shadow page and retry this
+	 * instruction directly.
+	 *
+	 * Note: if the guest uses a non-page-table modifying instruction
+	 * on the PDE that points to the instruction, then we will unmap
+	 * the instruction and go to an infinite loop. So, we cache the
+	 * last retried eip and the last fault address, if we meet the eip
+	 * and the address again, we can break out of the potential infinite
+	 * loop.
+	 */
+	vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
+
+	if (!(emulation_type & EMULTYPE_RETRY))
+		return false;
+
+	if (x86_page_table_writing_insn(ctxt))
+		return false;
+
+	if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
+		return false;
+
+	vcpu->arch.last_retry_eip = ctxt->eip;
+	vcpu->arch.last_retry_addr = cr2;
+
+	if (!vcpu->arch.mmu.direct_map)
+		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+
+	kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+
+	return true;
+}
+
 int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 			    unsigned long cr2,
 			    int emulation_type,
@@ -4877,6 +4921,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 		return EMULATE_DONE;
 	}
 
+	if (retry_instruction(ctxt, cr2, emulation_type))
+		return EMULATE_DONE;
+
 	/* this is needed for vmware backdoor interface to work since it
 	   changes registers values  during IO operation */
 	if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
-- 
cgit v1.2.3-55-g7522


From 6f6fbe98c3a9f3e9d69cd354a0459989e594e707 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 22 Sep 2011 16:55:10 +0800
Subject: KVM: x86: cleanup port-in/port-out emulated

Remove the same code between emulator_pio_in_emulated and
emulator_pio_out_emulated

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 59 ++++++++++++++++++++++++------------------------------
 1 file changed, 26 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7ba1ab73fd03..a2154487917d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4349,32 +4349,24 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 	return r;
 }
 
-
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
-				    int size, unsigned short port, void *val,
-				    unsigned int count)
+static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
+			       unsigned short port, void *val,
+			       unsigned int count, bool in)
 {
-	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
-	if (vcpu->arch.pio.count)
-		goto data_avail;
-
-	trace_kvm_pio(0, port, size, count);
+	trace_kvm_pio(!in, port, size, count);
 
 	vcpu->arch.pio.port = port;
-	vcpu->arch.pio.in = 1;
+	vcpu->arch.pio.in = in;
 	vcpu->arch.pio.count  = count;
 	vcpu->arch.pio.size = size;
 
 	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
-	data_avail:
-		memcpy(val, vcpu->arch.pio_data, size * count);
 		vcpu->arch.pio.count = 0;
 		return 1;
 	}
 
 	vcpu->run->exit_reason = KVM_EXIT_IO;
-	vcpu->run->io.direction = KVM_EXIT_IO_IN;
+	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
 	vcpu->run->io.size = size;
 	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
 	vcpu->run->io.count = count;
@@ -4383,36 +4375,37 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 	return 0;
 }
 
-static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
-				     int size, unsigned short port,
-				     const void *val, unsigned int count)
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+				    int size, unsigned short port, void *val,
+				    unsigned int count)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+	int ret;
 
-	trace_kvm_pio(1, port, size, count);
-
-	vcpu->arch.pio.port = port;
-	vcpu->arch.pio.in = 0;
-	vcpu->arch.pio.count = count;
-	vcpu->arch.pio.size = size;
-
-	memcpy(vcpu->arch.pio_data, val, size * count);
+	if (vcpu->arch.pio.count)
+		goto data_avail;
 
-	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+	ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
+	if (ret) {
+data_avail:
+		memcpy(val, vcpu->arch.pio_data, size * count);
 		vcpu->arch.pio.count = 0;
 		return 1;
 	}
 
-	vcpu->run->exit_reason = KVM_EXIT_IO;
-	vcpu->run->io.direction = KVM_EXIT_IO_OUT;
-	vcpu->run->io.size = size;
-	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-	vcpu->run->io.count = count;
-	vcpu->run->io.port = port;
-
 	return 0;
 }
 
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
+				     int size, unsigned short port,
+				     const void *val, unsigned int count)
+{
+	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+	memcpy(vcpu->arch.pio_data, val, size * count);
+	return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
+}
+
 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
 	return kvm_x86_ops->get_segment_base(vcpu, seg);
-- 
cgit v1.2.3-55-g7522


From d01f8d5e02cc79998e3160f7ad545f77891b00e5 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 22 Sep 2011 16:55:36 +0800
Subject: KVM: MMU: do not mark accessed bit on pte write path

In current code, the accessed bit is always set when page fault occurred,
do not need to set it on pte write path

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/mmu.c              | 22 +---------------------
 2 files changed, 1 insertion(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4ceefa9567ed..f8ab0d760231 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -356,7 +356,6 @@ struct kvm_vcpu_arch {
 	gfn_t last_pt_write_gfn;
 	int   last_pt_write_count;
 	u64  *last_pte_updated;
-	gfn_t last_pte_gfn;
 
 	struct fpu guest_fpu;
 	u64 xcr0;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7a22eb81b4ca..b432a71a1839 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2207,11 +2207,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (set_mmio_spte(sptep, gfn, pfn, pte_access))
 		return 0;
 
-	/*
-	 * We don't set the accessed bit, since we sometimes want to see
-	 * whether the guest actually used the pte (in order to detect
-	 * demand paging).
-	 */
 	spte = PT_PRESENT_MASK;
 	if (!speculative)
 		spte |= shadow_accessed_mask;
@@ -2362,10 +2357,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		}
 	}
 	kvm_release_pfn_clean(pfn);
-	if (speculative) {
+	if (speculative)
 		vcpu->arch.last_pte_updated = sptep;
-		vcpu->arch.last_pte_gfn = gfn;
-	}
 }
 
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -3533,18 +3526,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
 	return !!(spte && (*spte & shadow_accessed_mask));
 }
 
-static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-	u64 *spte = vcpu->arch.last_pte_updated;
-
-	if (spte
-	    && vcpu->arch.last_pte_gfn == gfn
-	    && shadow_accessed_mask
-	    && !(*spte & shadow_accessed_mask)
-	    && is_shadow_present_pte(*spte))
-		set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
-}
-
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes,
 		       bool guest_initiated)
@@ -3615,7 +3596,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	++vcpu->kvm->stat.mmu_pte_write;
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 	if (guest_initiated) {
-		kvm_mmu_access_page(vcpu, gfn);
 		if (gfn == vcpu->arch.last_pt_write_gfn
 		    && !last_updated_pte_accessed(vcpu)) {
 			++vcpu->arch.last_pt_write_count;
-- 
cgit v1.2.3-55-g7522


From 505aef8f30a95f7e4abf2c07e54ded1521587ba0 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 22 Sep 2011 16:56:06 +0800
Subject: KVM: MMU: cleanup FNAME(invlpg)

Directly Use mmu_page_zap_pte to zap spte in FNAME(invlpg), also remove the
same code between FNAME(invlpg) and FNAME(sync_page)

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 16 ++++++++++------
 arch/x86/kvm/paging_tmpl.h | 44 +++++++++++++++++---------------------------
 2 files changed, 27 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b432a71a1839..d15f908649e7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1809,7 +1809,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	}
 }
 
-static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			     u64 *spte)
 {
 	u64 pte;
@@ -1817,17 +1817,21 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
-		if (is_last_spte(pte, sp->role.level))
+		if (is_last_spte(pte, sp->role.level)) {
 			drop_spte(kvm, spte);
-		else {
+			if (is_large_pte(pte))
+				--kvm->stat.lpages;
+		} else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, spte);
 		}
-	} else if (is_mmio_spte(pte))
+		return true;
+	}
+
+	if (is_mmio_spte(pte))
 		mmu_spte_clear_no_track(spte);
 
-	if (is_large_pte(pte))
-		--kvm->stat.lpages;
+	return false;
 }
 
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 92994100638b..d8d3906649da 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -656,6 +656,18 @@ out_unlock:
 	return 0;
 }
 
+static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
+{
+	int offset = 0;
+
+	WARN_ON(sp->role.level != 1);
+
+	if (PTTYPE == 32)
+		offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+}
+
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	struct kvm_shadow_walk_iterator iterator;
@@ -663,7 +675,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 	gpa_t pte_gpa = -1;
 	int level;
 	u64 *sptep;
-	int need_flush = 0;
 
 	vcpu_clear_mmio_info(vcpu, gva);
 
@@ -675,36 +686,20 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 
 		sp = page_header(__pa(sptep));
 		if (is_last_spte(*sptep, level)) {
-			int offset, shift;
-
 			if (!sp->unsync)
 				break;
 
-			shift = PAGE_SHIFT -
-				  (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
-			offset = sp->role.quadrant << shift;
-
-			pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
+			pte_gpa = FNAME(get_level1_sp_gpa)(sp);
 			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
-			if (is_shadow_present_pte(*sptep)) {
-				if (is_large_pte(*sptep))
-					--vcpu->kvm->stat.lpages;
-				drop_spte(vcpu->kvm, sptep);
-				need_flush = 1;
-			} else if (is_mmio_spte(*sptep))
-				mmu_spte_clear_no_track(sptep);
-
-			break;
+			if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
+				kvm_flush_remote_tlbs(vcpu->kvm);
 		}
 
 		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
 			break;
 	}
 
-	if (need_flush)
-		kvm_flush_remote_tlbs(vcpu->kvm);
-
 	atomic_inc(&vcpu->kvm->arch.invlpg_counter);
 
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -769,19 +764,14 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
  */
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-	int i, offset, nr_present;
+	int i, nr_present = 0;
 	bool host_writable;
 	gpa_t first_pte_gpa;
 
-	offset = nr_present = 0;
-
 	/* direct kvm_mmu_page can not be unsync. */
 	BUG_ON(sp->role.direct);
 
-	if (PTTYPE == 32)
-		offset = sp->role.quadrant << PT64_LEVEL_BITS;
-
-	first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
 		unsigned pte_access;
-- 
cgit v1.2.3-55-g7522


From f57f2ef58f6703e6df70ed52a198920cb3e8edba Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 22 Sep 2011 16:56:39 +0800
Subject: KVM: MMU: fast prefetch spte on invlpg path

Fast prefetch spte for the unsync shadow page on invlpg path

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  4 +---
 arch/x86/kvm/mmu.c              | 38 +++++++++++++++-----------------------
 arch/x86/kvm/paging_tmpl.h      | 30 ++++++++++++++++++------------
 arch/x86/kvm/x86.c              |  4 ++--
 4 files changed, 36 insertions(+), 40 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f8ab0d760231..3c9ea26c7aea 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -461,7 +461,6 @@ struct kvm_arch {
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_max_mmu_pages;
 	unsigned int indirect_shadow_pages;
-	atomic_t invlpg_counter;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.
@@ -757,8 +756,7 @@ int fx_init(struct kvm_vcpu *vcpu);
 
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes,
-		       bool guest_initiated);
+		       const u8 *new, int bytes);
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d15f908649e7..c01137f10c6b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3531,8 +3531,7 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes,
-		       bool guest_initiated)
+		       const u8 *new, int bytes)
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	union kvm_mmu_page_role mask = { .word = 0 };
@@ -3541,7 +3540,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	LIST_HEAD(invalid_list);
 	u64 entry, gentry, *spte;
 	unsigned pte_size, page_offset, misaligned, quadrant, offset;
-	int level, npte, invlpg_counter, r, flooded = 0;
+	int level, npte, r, flooded = 0;
 	bool remote_flush, local_flush, zap_page;
 
 	/*
@@ -3556,19 +3555,16 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 
-	invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
-
 	/*
 	 * Assume that the pte write on a page table of the same type
 	 * as the current vcpu paging mode since we update the sptes only
 	 * when they have the same mode.
 	 */
-	if ((is_pae(vcpu) && bytes == 4) || !new) {
+	if (is_pae(vcpu) && bytes == 4) {
 		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
-		if (is_pae(vcpu)) {
-			gpa &= ~(gpa_t)7;
-			bytes = 8;
-		}
+		gpa &= ~(gpa_t)7;
+		bytes = 8;
+
 		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
 		if (r)
 			gentry = 0;
@@ -3594,22 +3590,18 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	 */
 	mmu_topup_memory_caches(vcpu);
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
-		gentry = 0;
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
-	if (guest_initiated) {
-		if (gfn == vcpu->arch.last_pt_write_gfn
-		    && !last_updated_pte_accessed(vcpu)) {
-			++vcpu->arch.last_pt_write_count;
-			if (vcpu->arch.last_pt_write_count >= 3)
-				flooded = 1;
-		} else {
-			vcpu->arch.last_pt_write_gfn = gfn;
-			vcpu->arch.last_pt_write_count = 1;
-			vcpu->arch.last_pte_updated = NULL;
-		}
+	if (gfn == vcpu->arch.last_pt_write_gfn
+	    && !last_updated_pte_accessed(vcpu)) {
+		++vcpu->arch.last_pt_write_count;
+		if (vcpu->arch.last_pt_write_count >= 3)
+			flooded = 1;
+	} else {
+		vcpu->arch.last_pt_write_gfn = gfn;
+		vcpu->arch.last_pt_write_count = 1;
+		vcpu->arch.last_pte_updated = NULL;
 	}
 
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d8d3906649da..9efb86035774 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -672,20 +672,27 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
-	gpa_t pte_gpa = -1;
 	int level;
 	u64 *sptep;
 
 	vcpu_clear_mmio_info(vcpu, gva);
 
-	spin_lock(&vcpu->kvm->mmu_lock);
+	/*
+	 * No need to check return value here, rmap_can_add() can
+	 * help us to skip pte prefetch later.
+	 */
+	mmu_topup_memory_caches(vcpu);
 
+	spin_lock(&vcpu->kvm->mmu_lock);
 	for_each_shadow_entry(vcpu, gva, iterator) {
 		level = iterator.level;
 		sptep = iterator.sptep;
 
 		sp = page_header(__pa(sptep));
 		if (is_last_spte(*sptep, level)) {
+			pt_element_t gpte;
+			gpa_t pte_gpa;
+
 			if (!sp->unsync)
 				break;
 
@@ -694,22 +701,21 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 
 			if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
 				kvm_flush_remote_tlbs(vcpu->kvm);
+
+			if (!rmap_can_add(vcpu))
+				break;
+
+			if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+						  sizeof(pt_element_t)))
+				break;
+
+			FNAME(update_pte)(vcpu, sp, sptep, &gpte);
 		}
 
 		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
 			break;
 	}
-
-	atomic_inc(&vcpu->kvm->arch.invlpg_counter);
-
 	spin_unlock(&vcpu->kvm->mmu_lock);
-
-	if (pte_gpa == -1)
-		return;
-
-	if (mmu_topup_memory_caches(vcpu))
-		return;
-	kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a2154487917d..9c980ce26e61 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4087,7 +4087,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
 	if (ret < 0)
 		return 0;
-	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
 	return 1;
 }
 
@@ -4324,7 +4324,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
 	if (!exchanged)
 		return X86EMUL_CMPXCHG_FAILED;
 
-	kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
+	kvm_mmu_pte_write(vcpu, gpa, new, bytes);
 
 	return X86EMUL_CONTINUE;
 
-- 
cgit v1.2.3-55-g7522


From f8734352c6f9c4f3d85f0c97b7731b7f925c62fd Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 22 Sep 2011 16:56:58 +0800
Subject: KVM: MMU: remove unnecessary kvm_mmu_free_some_pages

In kvm_mmu_pte_write, we do not need to alloc shadow page, so calling
kvm_mmu_free_some_pages is really unnecessary

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c01137f10c6b..7e57938bb86a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3590,7 +3590,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	 */
 	mmu_topup_memory_caches(vcpu);
 	spin_lock(&vcpu->kvm->mmu_lock);
-	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 	if (gfn == vcpu->arch.last_pt_write_gfn
-- 
cgit v1.2.3-55-g7522


From 889e5cbced6c191bb7e25c1b30b43e59a12561f9 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 22 Sep 2011 16:57:23 +0800
Subject: KVM: MMU: split kvm_mmu_pte_write function

kvm_mmu_pte_write is too long, we split it for better readable

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 194 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 119 insertions(+), 75 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7e57938bb86a..986aea55366b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3530,48 +3530,28 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
 	return !!(spte && (*spte & shadow_accessed_mask));
 }
 
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes)
+static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
+				    const u8 *new, int *bytes)
 {
-	gfn_t gfn = gpa >> PAGE_SHIFT;
-	union kvm_mmu_page_role mask = { .word = 0 };
-	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
-	LIST_HEAD(invalid_list);
-	u64 entry, gentry, *spte;
-	unsigned pte_size, page_offset, misaligned, quadrant, offset;
-	int level, npte, r, flooded = 0;
-	bool remote_flush, local_flush, zap_page;
-
-	/*
-	 * If we don't have indirect shadow pages, it means no page is
-	 * write-protected, so we can exit simply.
-	 */
-	if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
-		return;
-
-	zap_page = remote_flush = local_flush = false;
-	offset = offset_in_page(gpa);
-
-	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
+	u64 gentry;
+	int r;
 
 	/*
 	 * Assume that the pte write on a page table of the same type
 	 * as the current vcpu paging mode since we update the sptes only
 	 * when they have the same mode.
 	 */
-	if (is_pae(vcpu) && bytes == 4) {
+	if (is_pae(vcpu) && *bytes == 4) {
 		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
-		gpa &= ~(gpa_t)7;
-		bytes = 8;
-
-		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
+		*gpa &= ~(gpa_t)7;
+		*bytes = 8;
+		r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
 		if (r)
 			gentry = 0;
 		new = (const u8 *)&gentry;
 	}
 
-	switch (bytes) {
+	switch (*bytes) {
 	case 4:
 		gentry = *(const u32 *)new;
 		break;
@@ -3583,71 +3563,135 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		break;
 	}
 
-	/*
-	 * No need to care whether allocation memory is successful
-	 * or not since pte prefetch is skiped if it does not have
-	 * enough objects in the cache.
-	 */
-	mmu_topup_memory_caches(vcpu);
-	spin_lock(&vcpu->kvm->mmu_lock);
-	++vcpu->kvm->stat.mmu_pte_write;
-	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
+	return gentry;
+}
+
+/*
+ * If we're seeing too many writes to a page, it may no longer be a page table,
+ * or we may be forking, in which case it is better to unmap the page.
+ */
+static bool detect_write_flooding(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	bool flooded = false;
+
 	if (gfn == vcpu->arch.last_pt_write_gfn
 	    && !last_updated_pte_accessed(vcpu)) {
 		++vcpu->arch.last_pt_write_count;
 		if (vcpu->arch.last_pt_write_count >= 3)
-			flooded = 1;
+			flooded = true;
 	} else {
 		vcpu->arch.last_pt_write_gfn = gfn;
 		vcpu->arch.last_pt_write_count = 1;
 		vcpu->arch.last_pte_updated = NULL;
 	}
 
+	return flooded;
+}
+
+/*
+ * Misaligned accesses are too much trouble to fix up; also, they usually
+ * indicate a page is not used as a page table.
+ */
+static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
+				    int bytes)
+{
+	unsigned offset, pte_size, misaligned;
+
+	pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+		 gpa, bytes, sp->role.word);
+
+	offset = offset_in_page(gpa);
+	pte_size = sp->role.cr4_pae ? 8 : 4;
+	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+	misaligned |= bytes < 4;
+
+	return misaligned;
+}
+
+static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
+{
+	unsigned page_offset, quadrant;
+	u64 *spte;
+	int level;
+
+	page_offset = offset_in_page(gpa);
+	level = sp->role.level;
+	*nspte = 1;
+	if (!sp->role.cr4_pae) {
+		page_offset <<= 1;	/* 32->64 */
+		/*
+		 * A 32-bit pde maps 4MB while the shadow pdes map
+		 * only 2MB.  So we need to double the offset again
+		 * and zap two pdes instead of one.
+		 */
+		if (level == PT32_ROOT_LEVEL) {
+			page_offset &= ~7; /* kill rounding error */
+			page_offset <<= 1;
+			*nspte = 2;
+		}
+		quadrant = page_offset >> PAGE_SHIFT;
+		page_offset &= ~PAGE_MASK;
+		if (quadrant != sp->role.quadrant)
+			return NULL;
+	}
+
+	spte = &sp->spt[page_offset / sizeof(*spte)];
+	return spte;
+}
+
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+		       const u8 *new, int bytes)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	union kvm_mmu_page_role mask = { .word = 0 };
+	struct kvm_mmu_page *sp;
+	struct hlist_node *node;
+	LIST_HEAD(invalid_list);
+	u64 entry, gentry, *spte;
+	int npte;
+	bool remote_flush, local_flush, zap_page, flooded, misaligned;
+
+	/*
+	 * If we don't have indirect shadow pages, it means no page is
+	 * write-protected, so we can exit simply.
+	 */
+	if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+		return;
+
+	zap_page = remote_flush = local_flush = false;
+
+	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
+
+	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);
+
+	/*
+	 * No need to care whether allocation memory is successful
+	 * or not since pte prefetch is skiped if it does not have
+	 * enough objects in the cache.
+	 */
+	mmu_topup_memory_caches(vcpu);
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+	++vcpu->kvm->stat.mmu_pte_write;
+	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
+
+	flooded = detect_write_flooding(vcpu, gfn);
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
-		pte_size = sp->role.cr4_pae ? 8 : 4;
-		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
-		misaligned |= bytes < 4;
+		misaligned = detect_write_misaligned(sp, gpa, bytes);
+
 		if (misaligned || flooded) {
-			/*
-			 * Misaligned accesses are too much trouble to fix
-			 * up; also, they usually indicate a page is not used
-			 * as a page table.
-			 *
-			 * If we're seeing too many writes to a page,
-			 * it may no longer be a page table, or we may be
-			 * forking, in which case it is better to unmap the
-			 * page.
-			 */
-			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
-				 gpa, bytes, sp->role.word);
 			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 						     &invalid_list);
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
-		page_offset = offset;
-		level = sp->role.level;
-		npte = 1;
-		if (!sp->role.cr4_pae) {
-			page_offset <<= 1;	/* 32->64 */
-			/*
-			 * A 32-bit pde maps 4MB while the shadow pdes map
-			 * only 2MB.  So we need to double the offset again
-			 * and zap two pdes instead of one.
-			 */
-			if (level == PT32_ROOT_LEVEL) {
-				page_offset &= ~7; /* kill rounding error */
-				page_offset <<= 1;
-				npte = 2;
-			}
-			quadrant = page_offset >> PAGE_SHIFT;
-			page_offset &= ~PAGE_MASK;
-			if (quadrant != sp->role.quadrant)
-				continue;
-		}
+
+		spte = get_written_sptes(sp, gpa, &npte);
+		if (!spte)
+			continue;
+
 		local_flush = true;
-		spte = &sp->spt[page_offset / sizeof(*spte)];
 		while (npte--) {
 			entry = *spte;
 			mmu_page_zap_pte(vcpu->kvm, sp, spte);
-- 
cgit v1.2.3-55-g7522


From 5d9ca30e96f567b67a36727aa4ebb34911a2b84a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 22 Sep 2011 16:57:55 +0800
Subject: KVM: MMU: fix detecting misaligned accessed

Sometimes, we only modify the last one byte of a pte to update status bit,
for example, clear_bit is used to clear r/w bit in linux kernel and 'andb'
instruction is used in this function, in this case, kvm_mmu_pte_write will
treat it as misaligned access, and the shadow page table is zapped

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 986aea55366b..ca6f72ab4c3b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3602,6 +3602,14 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
 
 	offset = offset_in_page(gpa);
 	pte_size = sp->role.cr4_pae ? 8 : 4;
+
+	/*
+	 * Sometimes, the OS only writes the last one bytes to update status
+	 * bits, for example, in linux, andb instruction is used in clear_bit().
+	 */
+	if (!(offset & (pte_size - 1)) && bytes == 1)
+		return false;
+
 	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
 	misaligned |= bytes < 4;
 
-- 
cgit v1.2.3-55-g7522


From a30f47cb150dd8d109923eeb65fe73e8b3e09046 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 22 Sep 2011 16:58:36 +0800
Subject: KVM: MMU: improve write flooding detected

Detecting write-flooding does not work well, when we handle page written, if
the last speculative spte is not accessed, we treat the page is
write-flooding, however, we can speculative spte on many path, such as pte
prefetch, page synced, that means the last speculative spte may be not point
to the written page and the written page can be accessed via other sptes, so
depends on the Accessed bit of the last speculative spte is not enough

Instead of detected page accessed, we can detect whether the spte is accessed
after it is written, if the spte is not accessed but it is written frequently,
we treat is not a page table or it not used for a long time

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 ++--
 arch/x86/kvm/mmu.c              | 62 +++++++++++++++++------------------------
 arch/x86/kvm/paging_tmpl.h      | 12 ++++----
 3 files changed, 32 insertions(+), 48 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3c9ea26c7aea..c1f19de8b51c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -239,6 +239,8 @@ struct kvm_mmu_page {
 	int clear_spte_count;
 #endif
 
+	int write_flooding_count;
+
 	struct rcu_head rcu;
 };
 
@@ -353,10 +355,6 @@ struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
-	gfn_t last_pt_write_gfn;
-	int   last_pt_write_count;
-	u64  *last_pte_updated;
-
 	struct fpu guest_fpu;
 	u64 xcr0;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ca6f72ab4c3b..e9534cec003f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1653,6 +1653,18 @@ static void init_shadow_page_table(struct kvm_mmu_page *sp)
 		sp->spt[i] = 0ull;
 }
 
+static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
+{
+	sp->write_flooding_count = 0;
+}
+
+static void clear_sp_write_flooding_count(u64 *spte)
+{
+	struct kvm_mmu_page *sp =  page_header(__pa(spte));
+
+	__clear_sp_write_flooding_count(sp);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -1696,6 +1708,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		} else if (sp->unsync)
 			kvm_mmu_mark_parents_unsync(sp);
 
+		__clear_sp_write_flooding_count(sp);
 		trace_kvm_mmu_get_page(sp, false);
 		return sp;
 	}
@@ -1848,15 +1861,6 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
 	mmu_page_remove_parent_pte(sp, parent_pte);
 }
 
-static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
-{
-	int i;
-	struct kvm_vcpu *vcpu;
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		vcpu->arch.last_pte_updated = NULL;
-}
-
 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	u64 *parent_pte;
@@ -1916,7 +1920,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	}
 
 	sp->role.invalid = 1;
-	kvm_mmu_reset_last_pte_updated(kvm);
 	return ret;
 }
 
@@ -2361,8 +2364,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		}
 	}
 	kvm_release_pfn_clean(pfn);
-	if (speculative)
-		vcpu->arch.last_pte_updated = sptep;
 }
 
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -3523,13 +3524,6 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
 		kvm_mmu_flush_tlb(vcpu);
 }
 
-static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
-{
-	u64 *spte = vcpu->arch.last_pte_updated;
-
-	return !!(spte && (*spte & shadow_accessed_mask));
-}
-
 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
 				    const u8 *new, int *bytes)
 {
@@ -3570,22 +3564,16 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
  * If we're seeing too many writes to a page, it may no longer be a page table,
  * or we may be forking, in which case it is better to unmap the page.
  */
-static bool detect_write_flooding(struct kvm_vcpu *vcpu, gfn_t gfn)
+static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte)
 {
-	bool flooded = false;
-
-	if (gfn == vcpu->arch.last_pt_write_gfn
-	    && !last_updated_pte_accessed(vcpu)) {
-		++vcpu->arch.last_pt_write_count;
-		if (vcpu->arch.last_pt_write_count >= 3)
-			flooded = true;
-	} else {
-		vcpu->arch.last_pt_write_gfn = gfn;
-		vcpu->arch.last_pt_write_count = 1;
-		vcpu->arch.last_pte_updated = NULL;
-	}
+	/*
+	 * Skip write-flooding detected for the sp whose level is 1, because
+	 * it can become unsync, then the guest page is not write-protected.
+	 */
+	if (sp->role.level == 1)
+		return false;
 
-	return flooded;
+	return ++sp->write_flooding_count >= 3;
 }
 
 /*
@@ -3657,7 +3645,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	LIST_HEAD(invalid_list);
 	u64 entry, gentry, *spte;
 	int npte;
-	bool remote_flush, local_flush, zap_page, flooded, misaligned;
+	bool remote_flush, local_flush, zap_page;
 
 	/*
 	 * If we don't have indirect shadow pages, it means no page is
@@ -3683,12 +3671,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	++vcpu->kvm->stat.mmu_pte_write;
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 
-	flooded = detect_write_flooding(vcpu, gfn);
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
-		misaligned = detect_write_misaligned(sp, gpa, bytes);
+		spte = get_written_sptes(sp, gpa, &npte);
 
-		if (misaligned || flooded) {
+		if (detect_write_misaligned(sp, gpa, bytes) ||
+		      detect_write_flooding(sp, spte)) {
 			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 						     &invalid_list);
 			++vcpu->kvm->stat.mmu_flooded;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9efb86035774..52e9d58cec2b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -497,6 +497,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	     shadow_walk_next(&it)) {
 		gfn_t table_gfn;
 
+		clear_sp_write_flooding_count(it.sptep);
 		drop_large_spte(vcpu, it.sptep);
 
 		sp = NULL;
@@ -522,6 +523,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	     shadow_walk_next(&it)) {
 		gfn_t direct_gfn;
 
+		clear_sp_write_flooding_count(it.sptep);
 		validate_direct_spte(vcpu, it.sptep, direct_access);
 
 		drop_large_spte(vcpu, it.sptep);
@@ -536,6 +538,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		link_shadow_page(it.sptep, sp);
 	}
 
+	clear_sp_write_flooding_count(it.sptep);
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
 		     user_fault, write_fault, emulate, it.level,
 		     gw->gfn, pfn, prefault, map_writable);
@@ -599,11 +602,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	 */
 	if (!r) {
 		pgprintk("%s: guest page fault\n", __func__);
-		if (!prefault) {
+		if (!prefault)
 			inject_page_fault(vcpu, &walker.fault);
-			/* reset fork detector */
-			vcpu->arch.last_pt_write_count = 0;
-		}
+
 		return 0;
 	}
 
@@ -641,9 +642,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
 		 sptep, *sptep, emulate);
 
-	if (!emulate)
-		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
-
 	++vcpu->stat.pf_fixed;
 	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
-- 
cgit v1.2.3-55-g7522


From 3f2e5260f5a17d37be3e3c83aca2f335b9bf3893 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Wed, 14 Sep 2011 09:58:32 +0200
Subject: KVM: x86: Simplify kvm timer handler

The vcpu reference of a kvm_timer can't become NULL while the timer is
valid, so drop this redundant test. This also makes it pointless to
carry a separate __kvm_timer_fn, fold it into kvm_timer_fn.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/timer.c | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index ae432ea1cd83..6b85cc647f34 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -18,9 +18,10 @@
 #include <linux/atomic.h>
 #include "kvm_timer.h"
 
-static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
 {
-	int restart_timer = 0;
+	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+	struct kvm_vcpu *vcpu = ktimer->vcpu;
 	wait_queue_head_t *q = &vcpu->wq;
 
 	/*
@@ -40,26 +41,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
 
 	if (ktimer->t_ops->is_periodic(ktimer)) {
 		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
-		restart_timer = 1;
-	}
-
-	return restart_timer;
-}
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
-{
-	int restart_timer;
-	struct kvm_vcpu *vcpu;
-	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
-
-	vcpu = ktimer->vcpu;
-	if (!vcpu)
-		return HRTIMER_NORESTART;
-
-	restart_timer = __kvm_timer_fn(vcpu, ktimer);
-	if (restart_timer)
 		return HRTIMER_RESTART;
-	else
+	} else
 		return HRTIMER_NORESTART;
 }
-
-- 
cgit v1.2.3-55-g7522


From 5202397df819d3c5a3f201bd4af6b86542115fb6 Mon Sep 17 00:00:00 2001
From: Chris Wright
Date: Tue, 1 Nov 2011 17:28:47 -0700
Subject: KVM guest: remove KVM guest pv mmu support

This has not been used for some years now.  It's time to remove it.

Signed-off-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kernel/kvm.c | 181 --------------------------------------------------
 1 file changed, 181 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a9c2116001d6..f0c6fd6f176b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,8 +39,6 @@
 #include <asm/desc.h>
 #include <asm/tlbflush.h>
 
-#define MMU_QUEUE_SIZE 1024
-
 static int kvmapf = 1;
 
 static int parse_no_kvmapf(char *arg)
@@ -60,21 +58,10 @@ static int parse_no_stealacc(char *arg)
 
 early_param("no-steal-acc", parse_no_stealacc);
 
-struct kvm_para_state {
-	u8 mmu_queue[MMU_QUEUE_SIZE];
-	int mmu_queue_len;
-};
-
-static DEFINE_PER_CPU(struct kvm_para_state, para_state);
 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
 static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
 static int has_steal_clock = 0;
 
-static struct kvm_para_state *kvm_para_state(void)
-{
-	return &per_cpu(para_state, raw_smp_processor_id());
-}
-
 /*
  * No need for any "IO delay" on KVM
  */
@@ -271,151 +258,6 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
 	}
 }
 
-static void kvm_mmu_op(void *buffer, unsigned len)
-{
-	int r;
-	unsigned long a1, a2;
-
-	do {
-		a1 = __pa(buffer);
-		a2 = 0;   /* on i386 __pa() always returns <4G */
-		r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
-		buffer += r;
-		len -= r;
-	} while (len);
-}
-
-static void mmu_queue_flush(struct kvm_para_state *state)
-{
-	if (state->mmu_queue_len) {
-		kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
-		state->mmu_queue_len = 0;
-	}
-}
-
-static void kvm_deferred_mmu_op(void *buffer, int len)
-{
-	struct kvm_para_state *state = kvm_para_state();
-
-	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
-		kvm_mmu_op(buffer, len);
-		return;
-	}
-	if (state->mmu_queue_len + len > sizeof state->mmu_queue)
-		mmu_queue_flush(state);
-	memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
-	state->mmu_queue_len += len;
-}
-
-static void kvm_mmu_write(void *dest, u64 val)
-{
-	__u64 pte_phys;
-	struct kvm_mmu_op_write_pte wpte;
-
-#ifdef CONFIG_HIGHPTE
-	struct page *page;
-	unsigned long dst = (unsigned long) dest;
-
-	page = kmap_atomic_to_page(dest);
-	pte_phys = page_to_pfn(page);
-	pte_phys <<= PAGE_SHIFT;
-	pte_phys += (dst & ~(PAGE_MASK));
-#else
-	pte_phys = (unsigned long)__pa(dest);
-#endif
-	wpte.header.op = KVM_MMU_OP_WRITE_PTE;
-	wpte.pte_val = val;
-	wpte.pte_phys = pte_phys;
-
-	kvm_deferred_mmu_op(&wpte, sizeof wpte);
-}
-
-/*
- * We only need to hook operations that are MMU writes.  We hook these so that
- * we can use lazy MMU mode to batch these operations.  We could probably
- * improve the performance of the host code if we used some of the information
- * here to simplify processing of batched writes.
- */
-static void kvm_set_pte(pte_t *ptep, pte_t pte)
-{
-	kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
-			   pte_t *ptep, pte_t pte)
-{
-	kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
-{
-	kvm_mmu_write(pmdp, pmd_val(pmd));
-}
-
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
-static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
-	kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_pte_clear(struct mm_struct *mm,
-			  unsigned long addr, pte_t *ptep)
-{
-	kvm_mmu_write(ptep, 0);
-}
-
-static void kvm_pmd_clear(pmd_t *pmdp)
-{
-	kvm_mmu_write(pmdp, 0);
-}
-#endif
-
-static void kvm_set_pud(pud_t *pudp, pud_t pud)
-{
-	kvm_mmu_write(pudp, pud_val(pud));
-}
-
-#if PAGETABLE_LEVELS == 4
-static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
-{
-	kvm_mmu_write(pgdp, pgd_val(pgd));
-}
-#endif
-#endif /* PAGETABLE_LEVELS >= 3 */
-
-static void kvm_flush_tlb(void)
-{
-	struct kvm_mmu_op_flush_tlb ftlb = {
-		.header.op = KVM_MMU_OP_FLUSH_TLB,
-	};
-
-	kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
-}
-
-static void kvm_release_pt(unsigned long pfn)
-{
-	struct kvm_mmu_op_release_pt rpt = {
-		.header.op = KVM_MMU_OP_RELEASE_PT,
-		.pt_phys = (u64)pfn << PAGE_SHIFT,
-	};
-
-	kvm_mmu_op(&rpt, sizeof rpt);
-}
-
-static void kvm_enter_lazy_mmu(void)
-{
-	paravirt_enter_lazy_mmu();
-}
-
-static void kvm_leave_lazy_mmu(void)
-{
-	struct kvm_para_state *state = kvm_para_state();
-
-	mmu_queue_flush(state);
-	paravirt_leave_lazy_mmu();
-}
-
 static void __init paravirt_ops_setup(void)
 {
 	pv_info.name = "KVM";
@@ -424,29 +266,6 @@ static void __init paravirt_ops_setup(void)
 	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
 		pv_cpu_ops.io_delay = kvm_io_delay;
 
-	if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
-		pv_mmu_ops.set_pte = kvm_set_pte;
-		pv_mmu_ops.set_pte_at = kvm_set_pte_at;
-		pv_mmu_ops.set_pmd = kvm_set_pmd;
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
-		pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
-		pv_mmu_ops.pte_clear = kvm_pte_clear;
-		pv_mmu_ops.pmd_clear = kvm_pmd_clear;
-#endif
-		pv_mmu_ops.set_pud = kvm_set_pud;
-#if PAGETABLE_LEVELS == 4
-		pv_mmu_ops.set_pgd = kvm_set_pgd;
-#endif
-#endif
-		pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
-		pv_mmu_ops.release_pte = kvm_release_pt;
-		pv_mmu_ops.release_pmd = kvm_release_pt;
-		pv_mmu_ops.release_pud = kvm_release_pt;
-
-		pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
-		pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
-	}
 #ifdef CONFIG_X86_IO_APIC
 	no_timer_check = 1;
 #endif
-- 
cgit v1.2.3-55-g7522


From fb92045843a8cd99c7b843d9b567a680a3854ba1 Mon Sep 17 00:00:00 2001
From: Chris Wright
Date: Tue, 1 Nov 2011 17:31:18 -0700
Subject: KVM: MMU: remove KVM host pv mmu support

The host side pv mmu support has been marked for feature removal in
January 2011.  It's not in use, is slower than shadow or hardware
assisted paging, and a maintenance burden.  It's November 2011, time to
remove it.

Signed-off-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/feature-removal-schedule.txt |   9 --
 arch/x86/include/asm/kvm_host.h            |  13 ---
 arch/x86/kvm/mmu.c                         | 135 -----------------------------
 arch/x86/kvm/x86.c                         |  12 ---
 4 files changed, 169 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 3d849122b5b1..165019d527e6 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -362,15 +362,6 @@ Who: 	anybody or Florian Mickler <florian@mickler.org>
 
 ----------------------------
 
-What:	KVM paravirt mmu host support
-When:	January 2011
-Why:	The paravirt mmu host support is slower than non-paravirt mmu, both
-	on newer and older hardware.  It is already not exposed to the guest,
-	and kept only for live migration purposes.
-Who:	Avi Kivity <avi@redhat.com>
-
-----------------------------
-
 What:	iwlwifi 50XX module parameters
 When:	3.0
 Why:	The "..50" modules parameters were used to configure 5000 series and
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c1f19de8b51c..6d8326409974 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -244,13 +244,6 @@ struct kvm_mmu_page {
 	struct rcu_head rcu;
 };
 
-struct kvm_pv_mmu_op_buffer {
-	void *ptr;
-	unsigned len;
-	unsigned processed;
-	char buf[512] __aligned(sizeof(long));
-};
-
 struct kvm_pio_request {
 	unsigned long count;
 	int in;
@@ -347,10 +340,6 @@ struct kvm_vcpu_arch {
 	 */
 	struct kvm_mmu *walk_mmu;
 
-	/* only needed in kvm_pv_mmu_op() path, but it's hot so
-	 * put it here to avoid allocation */
-	struct kvm_pv_mmu_op_buffer mmu_op_buffer;
-
 	struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
@@ -667,8 +656,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
 
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 			  const void *val, int bytes);
-int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
-		  gpa_t addr, unsigned long *ret);
 u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 extern bool tdp_enabled;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e9534cec003f..a9b3a32bed08 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2028,20 +2028,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
-static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
-{
-	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
-	LIST_HEAD(invalid_list);
-
-	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
-		pgprintk("%s: zap %llx %x\n",
-			 __func__, gfn, sp->role.word);
-		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
-	}
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-}
-
 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 {
 	int slot = memslot_id(kvm, gfn);
@@ -4004,127 +3990,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 	return nr_mmu_pages;
 }
 
-static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
-				unsigned len)
-{
-	if (len > buffer->len)
-		return NULL;
-	return buffer->ptr;
-}
-
-static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
-				unsigned len)
-{
-	void *ret;
-
-	ret = pv_mmu_peek_buffer(buffer, len);
-	if (!ret)
-		return ret;
-	buffer->ptr += len;
-	buffer->len -= len;
-	buffer->processed += len;
-	return ret;
-}
-
-static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
-			     gpa_t addr, gpa_t value)
-{
-	int bytes = 8;
-	int r;
-
-	if (!is_long_mode(vcpu) && !is_pae(vcpu))
-		bytes = 4;
-
-	r = mmu_topup_memory_caches(vcpu);
-	if (r)
-		return r;
-
-	if (!emulator_write_phys(vcpu, addr, &value, bytes))
-		return -EFAULT;
-
-	return 1;
-}
-
-static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
-	(void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
-	return 1;
-}
-
-static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
-{
-	spin_lock(&vcpu->kvm->mmu_lock);
-	mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
-	spin_unlock(&vcpu->kvm->mmu_lock);
-	return 1;
-}
-
-static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
-			     struct kvm_pv_mmu_op_buffer *buffer)
-{
-	struct kvm_mmu_op_header *header;
-
-	header = pv_mmu_peek_buffer(buffer, sizeof *header);
-	if (!header)
-		return 0;
-	switch (header->op) {
-	case KVM_MMU_OP_WRITE_PTE: {
-		struct kvm_mmu_op_write_pte *wpte;
-
-		wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
-		if (!wpte)
-			return 0;
-		return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
-					wpte->pte_val);
-	}
-	case KVM_MMU_OP_FLUSH_TLB: {
-		struct kvm_mmu_op_flush_tlb *ftlb;
-
-		ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
-		if (!ftlb)
-			return 0;
-		return kvm_pv_mmu_flush_tlb(vcpu);
-	}
-	case KVM_MMU_OP_RELEASE_PT: {
-		struct kvm_mmu_op_release_pt *rpt;
-
-		rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
-		if (!rpt)
-			return 0;
-		return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
-	}
-	default: return 0;
-	}
-}
-
-int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
-		  gpa_t addr, unsigned long *ret)
-{
-	int r;
-	struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
-
-	buffer->ptr = buffer->buf;
-	buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
-	buffer->processed = 0;
-
-	r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
-	if (r)
-		goto out;
-
-	while (buffer->len) {
-		r = kvm_pv_mmu_op_one(vcpu, buffer);
-		if (r < 0)
-			goto out;
-		if (r == 0)
-			break;
-	}
-
-	r = 1;
-out:
-	*ret = buffer->processed;
-	return r;
-}
-
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 {
 	struct kvm_shadow_walk_iterator iterator;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9c980ce26e61..a3b25a524c9b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5273,15 +5273,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
-static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
-			   unsigned long a1)
-{
-	if (is_long_mode(vcpu))
-		return a0;
-	else
-		return a0 | ((gpa_t)a1 << 32);
-}
-
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 {
 	u64 param, ingpa, outgpa, ret;
@@ -5377,9 +5368,6 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	case KVM_HC_VAPIC_POLL_IRQ:
 		ret = 0;
 		break;
-	case KVM_HC_MMU_OP:
-		r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
-		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
-- 
cgit v1.2.3-55-g7522


From d6eebf8b80316ea61718dc115cd6a20c16195327 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Mon, 14 Nov 2011 18:21:34 +0900
Subject: KVM: MMU: Clean up BUG_ON() conditions in rmap_write_protect()

Remove redundant checks and use is_large_pte() macro.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a9b3a32bed08..973f25480afa 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1027,7 +1027,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
-		BUG_ON(!spte);
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 		if (is_writable_pte(*spte)) {
@@ -1043,9 +1042,8 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 		rmapp = gfn_to_rmap(kvm, gfn, i);
 		spte = rmap_next(kvm, rmapp, NULL);
 		while (spte) {
-			BUG_ON(!spte);
 			BUG_ON(!(*spte & PT_PRESENT_MASK));
-			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+			BUG_ON(!is_large_pte(*spte));
 			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 			if (is_writable_pte(*spte)) {
 				drop_spte(kvm, spte);
-- 
cgit v1.2.3-55-g7522


From 9b9b1492364758de82c19c36f07baa9ae162c7e5 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Mon, 14 Nov 2011 18:22:28 +0900
Subject: KVM: MMU: Split gfn_to_rmap() into two functions

rmap_write_protect() calls gfn_to_rmap() for each level with gfn fixed.
This results in calling gfn_to_memslot() repeatedly with that gfn.

This patch introduces __gfn_to_rmap() which takes the slot as an
argument to avoid this.

This is also needed for the following dirty logging optimization.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 973f25480afa..fa71085f75a3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -958,23 +958,29 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 	}
 }
 
-/*
- * Take gfn and return the reverse mapping to it.
- */
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
+static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
+				    struct kvm_memory_slot *slot)
 {
-	struct kvm_memory_slot *slot;
 	struct kvm_lpage_info *linfo;
 
-	slot = gfn_to_memslot(kvm, gfn);
 	if (likely(level == PT_PAGE_TABLE_LEVEL))
 		return &slot->rmap[gfn - slot->base_gfn];
 
 	linfo = lpage_info_slot(gfn, slot, level);
-
 	return &linfo->rmap_pde;
 }
 
+/*
+ * Take gfn and return the reverse mapping to it.
+ */
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	return __gfn_to_rmap(kvm, gfn, level, slot);
+}
+
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_memory_cache *cache;
@@ -1019,12 +1025,14 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 
 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
+	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	u64 *spte;
 	int i, write_protected = 0;
 
-	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
+	slot = gfn_to_memslot(kvm, gfn);
 
+	rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
@@ -1039,7 +1047,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 	/* check for huge page mappings */
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		rmapp = gfn_to_rmap(kvm, gfn, i);
+		rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
 		spte = rmap_next(kvm, rmapp, NULL);
 		while (spte) {
 			BUG_ON(!(*spte & PT_PRESENT_MASK));
-- 
cgit v1.2.3-55-g7522


From 7850ac5420803996e2960d15b924021f28e0dffc Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Mon, 14 Nov 2011 18:23:34 +0900
Subject: KVM: Count the number of dirty pages for dirty logging

Needed for the next patch which uses this number to decide how to write
protect a slot.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c       | 9 +++------
 include/linux/kvm_host.h | 1 +
 virt/kvm/kvm_main.c      | 4 +++-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a3b25a524c9b..220c83b0fbda 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3466,10 +3466,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 				      struct kvm_dirty_log *log)
 {
-	int r, i;
+	int r;
 	struct kvm_memory_slot *memslot;
 	unsigned long n;
-	unsigned long is_dirty = 0;
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -3484,11 +3483,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
 	n = kvm_dirty_bitmap_bytes(memslot);
 
-	for (i = 0; !is_dirty && i < n/sizeof(long); i++)
-		is_dirty = memslot->dirty_bitmap[i];
-
 	/* If nothing is dirty, don't bother messing with page tables. */
-	if (is_dirty) {
+	if (memslot->nr_dirty_pages) {
 		struct kvm_memslots *slots, *old_slots;
 		unsigned long *dirty_bitmap;
 
@@ -3503,6 +3499,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 			goto out;
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
+		slots->memslots[log->slot].nr_dirty_pages = 0;
 		slots->generation++;
 
 		old_slots = kvm->memslots;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c6a2ec925846..7c654aa46b6c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -181,6 +181,7 @@ struct kvm_memory_slot {
 	unsigned long *rmap;
 	unsigned long *dirty_bitmap;
 	unsigned long *dirty_bitmap_head;
+	unsigned long nr_dirty_pages;
 	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
 	unsigned long userspace_addr;
 	int user_alloc;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4c5b9a239674..af5c988cafcc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -625,6 +625,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 		return -ENOMEM;
 
 	memslot->dirty_bitmap_head = memslot->dirty_bitmap;
+	memslot->nr_dirty_pages = 0;
 	return 0;
 }
 #endif /* !CONFIG_S390 */
@@ -1491,7 +1492,8 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 
-		__set_bit_le(rel_gfn, memslot->dirty_bitmap);
+		if (!__test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap))
+			memslot->nr_dirty_pages++;
 	}
 }
 
-- 
cgit v1.2.3-55-g7522


From 95d4c16ce78cb6b7549a09159c409d52ddd18dae Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Mon, 14 Nov 2011 18:24:50 +0900
Subject: KVM: Optimize dirty logging by rmap_write_protect()

Currently, write protecting a slot needs to walk all the shadow pages
and checks ones which have a pte mapping a page in it.

The walk is overly heavy when dirty pages in that slot are not so many
and checking the shadow pages would result in unwanted cache pollution.

To mitigate this problem, we use rmap_write_protect() and check only
the sptes which can be reached from gfns marked in the dirty bitmap
when the number of dirty pages are less than that of shadow pages.

This criterion is reasonable in its meaning and worked well in our test:
write protection became some times faster than before when the ratio of
dirty pages are low and was not worse even when the ratio was near the
criterion.

Note that the locking for this write protection becomes fine grained.
The reason why this is safe is descripted in the comments.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu.c              | 14 +++++++---
 arch/x86/kvm/x86.c              | 58 ++++++++++++++++++++++++++++++++++++-----
 3 files changed, 63 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6d8326409974..69b652547489 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -648,6 +648,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
+			       struct kvm_memory_slot *slot);
 void kvm_mmu_zap_all(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fa71085f75a3..aecdea265f7e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1023,15 +1023,13 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }
 
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
+			       struct kvm_memory_slot *slot)
 {
-	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	u64 *spte;
 	int i, write_protected = 0;
 
-	slot = gfn_to_memslot(kvm, gfn);
-
 	rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
@@ -1066,6 +1064,14 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 	return write_protected;
 }
 
+static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	return kvm_mmu_rmap_write_protect(kvm, gfn, slot);
+}
+
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			   unsigned long data)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 220c83b0fbda..af546b768ffd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3460,6 +3460,50 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 	return 0;
 }
 
+/**
+ * write_protect_slot - write protect a slot for dirty logging
+ * @kvm: the kvm instance
+ * @memslot: the slot we protect
+ * @dirty_bitmap: the bitmap indicating which pages are dirty
+ * @nr_dirty_pages: the number of dirty pages
+ *
+ * We have two ways to find all sptes to protect:
+ * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and
+ *    checks ones that have a spte mapping a page in the slot.
+ * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
+ *
+ * Generally speaking, if there are not so many dirty pages compared to the
+ * number of shadow pages, we should use the latter.
+ *
+ * Note that letting others write into a page marked dirty in the old bitmap
+ * by using the remaining tlb entry is not a problem.  That page will become
+ * write protected again when we flush the tlb and then be reported dirty to
+ * the user space by copying the old bitmap.
+ */
+static void write_protect_slot(struct kvm *kvm,
+			       struct kvm_memory_slot *memslot,
+			       unsigned long *dirty_bitmap,
+			       unsigned long nr_dirty_pages)
+{
+	/* Not many dirty pages compared to # of shadow pages. */
+	if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
+		unsigned long gfn_offset;
+
+		for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
+			unsigned long gfn = memslot->base_gfn + gfn_offset;
+
+			spin_lock(&kvm->mmu_lock);
+			kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
+			spin_unlock(&kvm->mmu_lock);
+		}
+		kvm_flush_remote_tlbs(kvm);
+	} else {
+		spin_lock(&kvm->mmu_lock);
+		kvm_mmu_slot_remove_write_access(kvm, memslot->id);
+		spin_unlock(&kvm->mmu_lock);
+	}
+}
+
 /*
  * Get (and clear) the dirty memory log for a memory slot.
  */
@@ -3468,7 +3512,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 {
 	int r;
 	struct kvm_memory_slot *memslot;
-	unsigned long n;
+	unsigned long n, nr_dirty_pages;
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -3482,9 +3526,10 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		goto out;
 
 	n = kvm_dirty_bitmap_bytes(memslot);
+	nr_dirty_pages = memslot->nr_dirty_pages;
 
 	/* If nothing is dirty, don't bother messing with page tables. */
-	if (memslot->nr_dirty_pages) {
+	if (nr_dirty_pages) {
 		struct kvm_memslots *slots, *old_slots;
 		unsigned long *dirty_bitmap;
 
@@ -3498,8 +3543,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		if (!slots)
 			goto out;
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
-		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
-		slots->memslots[log->slot].nr_dirty_pages = 0;
+		memslot = &slots->memslots[log->slot];
+		memslot->dirty_bitmap = dirty_bitmap;
+		memslot->nr_dirty_pages = 0;
 		slots->generation++;
 
 		old_slots = kvm->memslots;
@@ -3508,9 +3554,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
 		kfree(old_slots);
 
-		spin_lock(&kvm->mmu_lock);
-		kvm_mmu_slot_remove_write_access(kvm, log->slot);
-		spin_unlock(&kvm->mmu_lock);
+		write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages);
 
 		r = -EFAULT;
 		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
-- 
cgit v1.2.3-55-g7522


From 46199f33c29533e7ad2a7d2128dc30175d1d4157 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 17 Nov 2011 10:56:09 +0200
Subject: KVM: VMX: remove unneeded vmx_load_host_state() calls.

vmx_load_host_state() does not handle msrs switching (except
MSR_KERNEL_GS_BASE) since commit 26bb0981b3f. Remove call to it
where it is no longer make sense.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6e28d582a514..ba24022f4575 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1747,7 +1747,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 	int save_nmsrs, index;
 	unsigned long *msr_bitmap;
 
-	vmx_load_host_state(vmx);
 	save_nmsrs = 0;
 #ifdef CONFIG_X86_64
 	if (is_long_mode(&vmx->vcpu)) {
@@ -2142,12 +2141,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 			return 1;
 		/* Otherwise falls through */
 	default:
-		vmx_load_host_state(to_vmx(vcpu));
 		if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
 			return 0;
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
-			vmx_load_host_state(to_vmx(vcpu));
 			data = msr->data;
 			break;
 		}
@@ -2171,7 +2168,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 
 	switch (msr_index) {
 	case MSR_EFER:
-		vmx_load_host_state(vmx);
 		ret = kvm_set_msr_common(vcpu, msr_index, data);
 		break;
 #ifdef CONFIG_X86_64
@@ -2220,7 +2216,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 			break;
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
-			vmx_load_host_state(vmx);
 			msr->data = data;
 			break;
 		}
-- 
cgit v1.2.3-55-g7522


From d7841a4b1b6e8509881e1ec21c024c82ccf565a6 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Tue, 22 Nov 2011 15:16:54 +0900
Subject: KVM: x86 emulator: Use opcode::execute for IN/OUT

IN : E4, E5, EC, ED
OUT: E6, E7, EE, EF

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 54 ++++++++++++++++++++++++++------------------------
 1 file changed, 28 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8547958e3582..8ba4ea8cac72 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2776,6 +2776,24 @@ static int em_jcxz(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_in(struct x86_emulate_ctxt *ctxt)
+{
+	if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
+			     &ctxt->dst.val))
+		return X86EMUL_IO_NEEDED;
+
+	return X86EMUL_CONTINUE;
+}
+
+static int em_out(struct x86_emulate_ctxt *ctxt)
+{
+	ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
+				    &ctxt->src.val, 1);
+	/* Disable writeback. */
+	ctxt->dst.type = OP_NONE;
+	return X86EMUL_CONTINUE;
+}
+
 static int em_cli(struct x86_emulate_ctxt *ctxt)
 {
 	if (emulator_bad_iopl(ctxt))
@@ -3004,6 +3022,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 #define D2bv(_f)      D((_f) | ByteOp), D(_f)
 #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
 #define I2bv(_f, _e)  I((_f) | ByteOp, _e), I(_f, _e)
+#define I2bvIP(_f, _e, _i, _p) \
+	IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
 
 #define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e),		\
 		I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e),	\
@@ -3217,13 +3237,13 @@ static struct opcode opcode_table[256] = {
 	/* 0xE0 - 0xE7 */
 	X3(I(SrcImmByte, em_loop)),
 	I(SrcImmByte, em_jcxz),
-	D2bvIP(SrcImmUByte | DstAcc, in,  check_perm_in),
-	D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
+	I2bvIP(SrcImmUByte | DstAcc, em_in,  in,  check_perm_in),
+	I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
 	/* 0xE8 - 0xEF */
 	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
 	I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
-	D2bvIP(SrcDX | DstAcc, in,  check_perm_in),
-	D2bvIP(SrcAcc | DstDX, out, check_perm_out),
+	I2bvIP(SrcDX | DstAcc, em_in,  in,  check_perm_in),
+	I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
 	/* 0xF0 - 0xF7 */
 	N, DI(ImplicitOps, icebp), N, N,
 	DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
@@ -3325,6 +3345,7 @@ static struct opcode twobyte_table[256] = {
 #undef D2bv
 #undef D2bvIP
 #undef I2bv
+#undef I2bvIP
 #undef I6ALU
 
 static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
@@ -3867,11 +3888,12 @@ special_insn:
 	case 0x6c:		/* insb */
 	case 0x6d:		/* insw/insd */
 		ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
-		goto do_io_in;
+		rc = em_in(ctxt);
+		break;
 	case 0x6e:		/* outsb */
 	case 0x6f:		/* outsw/outsd */
 		ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
-		goto do_io_out;
+		rc = em_out(ctxt);
 		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(ctxt->b, ctxt->eflags))
@@ -3915,12 +3937,6 @@ special_insn:
 		ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
 		rc = em_grp2(ctxt);
 		break;
-	case 0xe4: 	/* inb */
-	case 0xe5: 	/* in */
-		goto do_io_in;
-	case 0xe6: /* outb */
-	case 0xe7: /* out */
-		goto do_io_out;
 	case 0xe8: /* call (near) */ {
 		long int rel = ctxt->src.val;
 		ctxt->src.val = (unsigned long) ctxt->_eip;
@@ -3933,20 +3949,6 @@ special_insn:
 		jmp_rel(ctxt, ctxt->src.val);
 		ctxt->dst.type = OP_NONE; /* Disable writeback. */
 		break;
-	case 0xec: /* in al,dx */
-	case 0xed: /* in (e/r)ax,dx */
-	do_io_in:
-		if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
-				     &ctxt->dst.val))
-			goto done; /* IO is needed */
-		break;
-	case 0xee: /* out dx,al */
-	case 0xef: /* out dx,(e/r)ax */
-	do_io_out:
-		ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
-				      &ctxt->src.val, 1);
-		ctxt->dst.type = OP_NONE;	/* Disable writeback. */
-		break;
 	case 0xf4:              /* hlt */
 		ctxt->ops->halt(ctxt);
 		break;
-- 
cgit v1.2.3-55-g7522


From ce7faab24fbfb0b5207636ee4795e924bcf97e8a Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Tue, 22 Nov 2011 15:17:48 +0900
Subject: KVM: x86 emulator: Use opcode::execute for BT family

BT : 0F A3
BTS: 0F AB
BTR: 0F B3
BTC: 0F BB

Group 8: 0F BA

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 77 +++++++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 39 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8ba4ea8cac72..7a9ce6dbd1ce 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2813,6 +2813,35 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_bt(struct x86_emulate_ctxt *ctxt)
+{
+	/* Disable writeback. */
+	ctxt->dst.type = OP_NONE;
+	/* only subword offset */
+	ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
+
+	emulate_2op_SrcV_nobyte(ctxt, "bt");
+	return X86EMUL_CONTINUE;
+}
+
+static int em_bts(struct x86_emulate_ctxt *ctxt)
+{
+	emulate_2op_SrcV_nobyte(ctxt, "bts");
+	return X86EMUL_CONTINUE;
+}
+
+static int em_btr(struct x86_emulate_ctxt *ctxt)
+{
+	emulate_2op_SrcV_nobyte(ctxt, "btr");
+	return X86EMUL_CONTINUE;
+}
+
+static int em_btc(struct x86_emulate_ctxt *ctxt)
+{
+	emulate_2op_SrcV_nobyte(ctxt, "btc");
+	return X86EMUL_CONTINUE;
+}
+
 static bool valid_cr(int nr)
 {
 	switch (nr) {
@@ -3117,10 +3146,10 @@ static struct group_dual group7 = { {
 
 static struct opcode group8[] = {
 	N, N, N, N,
-	D(DstMem | SrcImmByte | ModRM),
-	D(DstMem | SrcImmByte | ModRM | Lock | PageTable),
-	D(DstMem | SrcImmByte | ModRM | Lock),
-	D(DstMem | SrcImmByte | ModRM | Lock | PageTable),
+	I(DstMem | SrcImmByte | ModRM, em_bt),
+	I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts),
+	I(DstMem | SrcImmByte | ModRM | Lock, em_btr),
+	I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc),
 };
 
 static struct group_dual group9 = { {
@@ -3299,26 +3328,27 @@ static struct opcode twobyte_table[256] = {
 	X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
 	/* 0xA0 - 0xA7 */
 	I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
-	DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
+	DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
 	D(DstMem | SrcReg | Src2ImmByte | ModRM),
 	D(DstMem | SrcReg | Src2CL | ModRM), N, N,
 	/* 0xA8 - 0xAF */
 	I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
 	DI(ImplicitOps, rsm),
-	D(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable),
+	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
 	D(DstMem | SrcReg | Src2ImmByte | ModRM),
 	D(DstMem | SrcReg | Src2CL | ModRM),
 	D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
 	/* 0xB0 - 0xB7 */
 	D2bv(DstMem | SrcReg | ModRM | Lock | PageTable),
 	I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
-	D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xB8 - 0xBF */
 	N, N,
-	G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable),
+	G(BitOp, group8),
+	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
 	D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
@@ -4103,21 +4133,10 @@ twobyte_insn:
 	case 0x90 ... 0x9f:     /* setcc r/m8 */
 		ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
 		break;
-	case 0xa3:
-	      bt:		/* bt */
-		ctxt->dst.type = OP_NONE;
-		/* only subword offset */
-		ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
-		emulate_2op_SrcV_nobyte(ctxt, "bt");
-		break;
 	case 0xa4: /* shld imm8, r, r/m */
 	case 0xa5: /* shld cl, r, r/m */
 		emulate_2op_cl(ctxt, "shld");
 		break;
-	case 0xab:
-	      bts:		/* bts */
-		emulate_2op_SrcV_nobyte(ctxt, "bts");
-		break;
 	case 0xac: /* shrd imm8, r, r/m */
 	case 0xad: /* shrd cl, r, r/m */
 		emulate_2op_cl(ctxt, "shrd");
@@ -4141,31 +4160,11 @@ twobyte_insn:
 			ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
 		}
 		break;
-	case 0xb3:
-	      btr:		/* btr */
-		emulate_2op_SrcV_nobyte(ctxt, "btr");
-		break;
 	case 0xb6 ... 0xb7:	/* movzx */
 		ctxt->dst.bytes = ctxt->op_bytes;
 		ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
 						       : (u16) ctxt->src.val;
 		break;
-	case 0xba:		/* Grp8 */
-		switch (ctxt->modrm_reg & 3) {
-		case 0:
-			goto bt;
-		case 1:
-			goto bts;
-		case 2:
-			goto btr;
-		case 3:
-			goto btc;
-		}
-		break;
-	case 0xbb:
-	      btc:		/* btc */
-		emulate_2op_SrcV_nobyte(ctxt, "btc");
-		break;
 	case 0xbc: {		/* bsf */
 		u8 zf;
 		__asm__ ("bsf %2, %0; setz %1"
-- 
cgit v1.2.3-55-g7522


From d4ddafcdf2201326ec9717172767cfad0ede1472 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Tue, 22 Nov 2011 15:18:35 +0900
Subject: KVM: x86 emulator: Use opcode::execute for CALL

CALL: E8

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7a9ce6dbd1ce..6b7a03b18f89 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2482,6 +2482,15 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_call(struct x86_emulate_ctxt *ctxt)
+{
+	long rel = ctxt->src.val;
+
+	ctxt->src.val = (unsigned long)ctxt->_eip;
+	jmp_rel(ctxt, rel);
+	return em_push(ctxt);
+}
+
 static int em_call_far(struct x86_emulate_ctxt *ctxt)
 {
 	u16 sel, old_cs;
@@ -3269,7 +3278,7 @@ static struct opcode opcode_table[256] = {
 	I2bvIP(SrcImmUByte | DstAcc, em_in,  in,  check_perm_in),
 	I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
 	/* 0xE8 - 0xEF */
-	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
+	I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps),
 	I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
 	I2bvIP(SrcDX | DstAcc, em_in,  in,  check_perm_in),
 	I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
@@ -3967,13 +3976,6 @@ special_insn:
 		ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
 		rc = em_grp2(ctxt);
 		break;
-	case 0xe8: /* call (near) */ {
-		long int rel = ctxt->src.val;
-		ctxt->src.val = (unsigned long) ctxt->_eip;
-		jmp_rel(ctxt, rel);
-		rc = em_push(ctxt);
-		break;
-	}
 	case 0xe9: /* jmp rel */
 	case 0xeb: /* jmp rel short */
 		jmp_rel(ctxt, ctxt->src.val);
-- 
cgit v1.2.3-55-g7522


From bc00f8d2c2df33c6e7fc8a12c94c0c74d491e566 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Tue, 22 Nov 2011 15:19:19 +0900
Subject: KVM: x86 emulator: Use opcode::execute for MOV to cr/dr

MOV: 0F 22 (move to control registers)
MOV: 0F 23 (move to debug registers)

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 52 +++++++++++++++++++++++++++++---------------------
 1 file changed, 30 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6b7a03b18f89..7fe5ed126f6f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2638,6 +2638,34 @@ static int em_mov(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_cr_write(struct x86_emulate_ctxt *ctxt)
+{
+	if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
+		return emulate_gp(ctxt, 0);
+
+	/* Disable writeback. */
+	ctxt->dst.type = OP_NONE;
+	return X86EMUL_CONTINUE;
+}
+
+static int em_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+	unsigned long val;
+
+	if (ctxt->mode == X86EMUL_MODE_PROT64)
+		val = ctxt->src.val & ~0ULL;
+	else
+		val = ctxt->src.val & ~0U;
+
+	/* #UD condition is already handled. */
+	if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0)
+		return emulate_gp(ctxt, 0);
+
+	/* Disable writeback. */
+	ctxt->dst.type = OP_NONE;
+	return X86EMUL_CONTINUE;
+}
+
 static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
 {
 	if (ctxt->modrm_reg > VCPU_SREG_GS)
@@ -3304,8 +3332,8 @@ static struct opcode twobyte_table[256] = {
 	/* 0x20 - 0x2F */
 	DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
 	DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
-	DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write),
-	DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write),
+	IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
+	IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
 	N, N, N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x30 - 0x3F */
@@ -4080,26 +4108,6 @@ twobyte_insn:
 	case 0x21: /* mov from dr to reg */
 		ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
 		break;
-	case 0x22: /* mov reg, cr */
-		if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) {
-			emulate_gp(ctxt, 0);
-			rc = X86EMUL_PROPAGATE_FAULT;
-			goto done;
-		}
-		ctxt->dst.type = OP_NONE;
-		break;
-	case 0x23: /* mov from reg to dr */
-		if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val &
-				((ctxt->mode == X86EMUL_MODE_PROT64) ?
-				 ~0ULL : ~0U)) < 0) {
-			/* #UD condition is already handled by the code above */
-			emulate_gp(ctxt, 0);
-			rc = X86EMUL_PROPAGATE_FAULT;
-			goto done;
-		}
-
-		ctxt->dst.type = OP_NONE;	/* no writeback */
-		break;
 	case 0x30:
 		/* wrmsr */
 		msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
-- 
cgit v1.2.3-55-g7522


From e1e210b0a7f7b4d14577bdc76719963f7facc0e7 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Tue, 22 Nov 2011 15:20:03 +0900
Subject: KVM: x86 emulator: Use opcode::execute for WRMSR/RDMSR

WRMSR: 0F 30
RDMSR: 0F 32

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 52 +++++++++++++++++++++++++-------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7fe5ed126f6f..906c5eb34aa7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2666,6 +2666,30 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
+{
+	u64 msr_data;
+
+	msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
+		| ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
+	if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data))
+		return emulate_gp(ctxt, 0);
+
+	return X86EMUL_CONTINUE;
+}
+
+static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
+{
+	u64 msr_data;
+
+	if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data))
+		return emulate_gp(ctxt, 0);
+
+	ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
+	ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
+	return X86EMUL_CONTINUE;
+}
+
 static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
 {
 	if (ctxt->modrm_reg > VCPU_SREG_GS)
@@ -3337,9 +3361,9 @@ static struct opcode twobyte_table[256] = {
 	N, N, N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x30 - 0x3F */
-	DI(ImplicitOps | Priv, wrmsr),
+	II(ImplicitOps | Priv, em_wrmsr, wrmsr),
 	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
-	DI(ImplicitOps | Priv, rdmsr),
+	II(ImplicitOps | Priv, em_rdmsr, rdmsr),
 	DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
 	I(ImplicitOps | VendorSpecific, em_sysenter),
 	I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
@@ -3818,7 +3842,6 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
-	u64 msr_data;
 	int rc = X86EMUL_CONTINUE;
 	int saved_dst_type = ctxt->dst.type;
 
@@ -4108,29 +4131,6 @@ twobyte_insn:
 	case 0x21: /* mov from dr to reg */
 		ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
 		break;
-	case 0x30:
-		/* wrmsr */
-		msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
-			| ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
-		if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) {
-			emulate_gp(ctxt, 0);
-			rc = X86EMUL_PROPAGATE_FAULT;
-			goto done;
-		}
-		rc = X86EMUL_CONTINUE;
-		break;
-	case 0x32:
-		/* rdmsr */
-		if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) {
-			emulate_gp(ctxt, 0);
-			rc = X86EMUL_PROPAGATE_FAULT;
-			goto done;
-		} else {
-			ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
-			ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
-		}
-		rc = X86EMUL_CONTINUE;
-		break;
 	case 0x40 ... 0x4f:	/* cmov */
 		ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
 		if (!test_cc(ctxt->b, ctxt->eflags))
-- 
cgit v1.2.3-55-g7522


From e940b5c20f89282fe826c5e2237932ab280497cf Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Tue, 22 Nov 2011 15:20:47 +0900
Subject: KVM: x86 emulator: Use opcode::execute for CMPXCHG

CMPXCHG: 0F B0, 0F B1

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 906c5eb34aa7..799000d8bf8b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1832,6 +1832,24 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 	return rc;
 }
 
+static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
+{
+	/* Save real source value, then compare EAX against destination. */
+	ctxt->src.orig_val = ctxt->src.val;
+	ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
+	emulate_2op_SrcV(ctxt, "cmp");
+
+	if (ctxt->eflags & EFLG_ZF) {
+		/* Success: write back to memory. */
+		ctxt->dst.val = ctxt->src.orig_val;
+	} else {
+		/* Failure: write the value we saw to EAX. */
+		ctxt->dst.type = OP_REG;
+		ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
+	}
+	return X86EMUL_CONTINUE;
+}
+
 static int em_lseg(struct x86_emulate_ctxt *ctxt)
 {
 	int seg = ctxt->src2.val;
@@ -3400,7 +3418,7 @@ static struct opcode twobyte_table[256] = {
 	D(DstMem | SrcReg | Src2CL | ModRM),
 	D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
 	/* 0xB0 - 0xB7 */
-	D2bv(DstMem | SrcReg | ModRM | Lock | PageTable),
+	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
 	I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
@@ -4153,23 +4171,6 @@ twobyte_insn:
 		break;
 	case 0xae:              /* clflush */
 		break;
-	case 0xb0 ... 0xb1:	/* cmpxchg */
-		/*
-		 * Save real source value, then compare EAX against
-		 * destination.
-		 */
-		ctxt->src.orig_val = ctxt->src.val;
-		ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
-		emulate_2op_SrcV(ctxt, "cmp");
-		if (ctxt->eflags & EFLG_ZF) {
-			/* Success: write back to memory. */
-			ctxt->dst.val = ctxt->src.orig_val;
-		} else {
-			/* Failure: write the value we saw to EAX. */
-			ctxt->dst.type = OP_REG;
-			ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
-		}
-		break;
 	case 0xb6 ... 0xb7:	/* movzx */
 		ctxt->dst.bytes = ctxt->op_bytes;
 		ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
-- 
cgit v1.2.3-55-g7522


From ff227392cd7b858b2b04732e02697122fd1b35b0 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Tue, 22 Nov 2011 15:21:33 +0900
Subject: KVM: x86 emulator: Use opcode::execute for BSF/BSR

BSF: 0F BC
BSR: 0F BD

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 60 +++++++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 799000d8bf8b..4cd3313b4131 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2921,6 +2921,40 @@ static int em_btc(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_bsf(struct x86_emulate_ctxt *ctxt)
+{
+	u8 zf;
+
+	__asm__ ("bsf %2, %0; setz %1"
+		 : "=r"(ctxt->dst.val), "=q"(zf)
+		 : "r"(ctxt->src.val));
+
+	ctxt->eflags &= ~X86_EFLAGS_ZF;
+	if (zf) {
+		ctxt->eflags |= X86_EFLAGS_ZF;
+		/* Disable writeback. */
+		ctxt->dst.type = OP_NONE;
+	}
+	return X86EMUL_CONTINUE;
+}
+
+static int em_bsr(struct x86_emulate_ctxt *ctxt)
+{
+	u8 zf;
+
+	__asm__ ("bsr %2, %0; setz %1"
+		 : "=r"(ctxt->dst.val), "=q"(zf)
+		 : "r"(ctxt->src.val));
+
+	ctxt->eflags &= ~X86_EFLAGS_ZF;
+	if (zf) {
+		ctxt->eflags |= X86_EFLAGS_ZF;
+		/* Disable writeback. */
+		ctxt->dst.type = OP_NONE;
+	}
+	return X86EMUL_CONTINUE;
+}
+
 static bool valid_cr(int nr)
 {
 	switch (nr) {
@@ -3428,7 +3462,7 @@ static struct opcode twobyte_table[256] = {
 	N, N,
 	G(BitOp, group8),
 	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
-	D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
 	D2bv(DstMem | SrcReg | ModRM | Lock),
@@ -4176,30 +4210,6 @@ twobyte_insn:
 		ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
 						       : (u16) ctxt->src.val;
 		break;
-	case 0xbc: {		/* bsf */
-		u8 zf;
-		__asm__ ("bsf %2, %0; setz %1"
-			 : "=r"(ctxt->dst.val), "=q"(zf)
-			 : "r"(ctxt->src.val));
-		ctxt->eflags &= ~X86_EFLAGS_ZF;
-		if (zf) {
-			ctxt->eflags |= X86_EFLAGS_ZF;
-			ctxt->dst.type = OP_NONE;	/* Disable writeback. */
-		}
-		break;
-	}
-	case 0xbd: {		/* bsr */
-		u8 zf;
-		__asm__ ("bsr %2, %0; setz %1"
-			 : "=r"(ctxt->dst.val), "=q"(zf)
-			 : "r"(ctxt->src.val));
-		ctxt->eflags &= ~X86_EFLAGS_ZF;
-		if (zf) {
-			ctxt->eflags |= X86_EFLAGS_ZF;
-			ctxt->dst.type = OP_NONE;	/* Disable writeback. */
-		}
-		break;
-	}
 	case 0xbe ... 0xbf:	/* movsx */
 		ctxt->dst.bytes = ctxt->op_bytes;
 		ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
-- 
cgit v1.2.3-55-g7522


From 93a5cef07d686a0341d056b0f930a762c7174a13 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 24 Nov 2011 17:37:48 +0800
Subject: KVM: introduce KVM_MEM_SLOTS_NUM macro

Introduce KVM_MEM_SLOTS_NUM macro to instead of
KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 4 +++-
 arch/x86/kvm/mmu.c              | 2 +-
 include/linux/kvm_host.h        | 7 +++++--
 virt/kvm/kvm_main.c             | 2 +-
 4 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 69b652547489..1769f3dde611 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -31,6 +31,8 @@
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
+#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+
 #define KVM_MMIO_SIZE 16
 
 #define KVM_PIO_PAGE_OFFSET 1
@@ -228,7 +230,7 @@ struct kvm_mmu_page {
 	 * One bit set per slot which has memory
 	 * in this shadow page.
 	 */
-	DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+	DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM);
 	bool unsync;
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aecdea265f7e..715dcb4fb798 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1349,7 +1349,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 						  PAGE_SIZE);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+	bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
 	sp->parent_ptes = 0;
 	mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7c654aa46b6c..924df0d7ac5f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -227,11 +227,14 @@ struct kvm_irq_routing_table {};
 
 #endif
 
+#ifndef KVM_MEM_SLOTS_NUM
+#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+#endif
+
 struct kvm_memslots {
 	int nmemslots;
 	u64 generation;
-	struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
-					KVM_PRIVATE_MEM_SLOTS];
+	struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM];
 };
 
 struct kvm {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index af5c988cafcc..9ad94c9996e7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -663,7 +663,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 			(void __user *)(unsigned long)mem->userspace_addr,
 			mem->memory_size)))
 		goto out;
-	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+	if (mem->slot >= KVM_MEM_SLOTS_NUM)
 		goto out;
 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
 		goto out;
-- 
cgit v1.2.3-55-g7522


From be593d6286075801bba6d60fa466a39c24cc7616 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 24 Nov 2011 17:38:24 +0800
Subject: KVM: introduce update_memslots function

Introduce update_memslots to update slot which will be update to
kvm->memslots

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c       |  2 +-
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      | 22 +++++++++++++++-------
 3 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index af546b768ffd..917a287d21c8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3546,7 +3546,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		memslot = &slots->memslots[log->slot];
 		memslot->dirty_bitmap = dirty_bitmap;
 		memslot->nr_dirty_pages = 0;
-		slots->generation++;
+		update_memslots(slots, NULL);
 
 		old_slots = kvm->memslots;
 		rcu_assign_pointer(kvm->memslots, slots);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 924df0d7ac5f..23f795c66220 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -320,6 +320,7 @@ void kvm_exit(void);
 
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
+void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new);
 
 static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9ad94c9996e7..b5ed7770ced3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -630,6 +630,19 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 }
 #endif /* !CONFIG_S390 */
 
+void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
+{
+	if (new) {
+		int id = new->id;
+
+		slots->memslots[id] = *new;
+		if (id >= slots->nmemslots)
+			slots->nmemslots = id + 1;
+	}
+
+	slots->generation++;
+}
+
 /*
  * Allocate some memory and give it an address in the guest physical address
  * space.
@@ -780,10 +793,8 @@ skip_lpage:
 				GFP_KERNEL);
 		if (!slots)
 			goto out_free;
-		if (mem->slot >= slots->nmemslots)
-			slots->nmemslots = mem->slot + 1;
-		slots->generation++;
 		slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
+		update_memslots(slots, NULL);
 
 		old_memslots = kvm->memslots;
 		rcu_assign_pointer(kvm->memslots, slots);
@@ -815,9 +826,6 @@ skip_lpage:
 			GFP_KERNEL);
 	if (!slots)
 		goto out_free;
-	if (mem->slot >= slots->nmemslots)
-		slots->nmemslots = mem->slot + 1;
-	slots->generation++;
 
 	/* actual memory is freed via old in kvm_free_physmem_slot below */
 	if (!npages) {
@@ -827,7 +835,7 @@ skip_lpage:
 			new.lpage_info[i] = NULL;
 	}
 
-	slots->memslots[mem->slot] = new;
+	update_memslots(slots, &new);
 	old_memslots = kvm->memslots;
 	rcu_assign_pointer(kvm->memslots, slots);
 	synchronize_srcu_expedited(&kvm->srcu);
-- 
cgit v1.2.3-55-g7522


From be6ba0f0962a39091c52eb9167ddea201fe80716 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 24 Nov 2011 17:39:18 +0800
Subject: KVM: introduce kvm_for_each_memslot macro

Introduce kvm_for_each_memslot to walk all valid memslot

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c |  6 ++----
 arch/x86/kvm/mmu.c       | 12 ++++++------
 include/linux/kvm_host.h |  4 ++++
 virt/kvm/iommu.c         | 17 +++++++++--------
 virt/kvm/kvm_main.c      | 14 ++++++--------
 5 files changed, 27 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 43f4c92816ef..42ad1f9c9f01 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1366,14 +1366,12 @@ static void kvm_release_vm_pages(struct kvm *kvm)
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
-	int i, j;
+	int j;
 	unsigned long base_gfn;
 
 	slots = kvm_memslots(kvm);
-	for (i = 0; i < slots->nmemslots; i++) {
-		memslot = &slots->memslots[i];
+	kvm_for_each_memslot(memslot, slots) {
 		base_gfn = memslot->base_gfn;
-
 		for (j = 0; j < memslot->npages; j++) {
 			if (memslot->rmap[j])
 				put_page((struct page *)memslot->rmap[j]);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 715dcb4fb798..d737443cdfdb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1128,15 +1128,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
 					 unsigned long data))
 {
-	int i, j;
+	int j;
 	int ret;
 	int retval = 0;
 	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
 
 	slots = kvm_memslots(kvm);
 
-	for (i = 0; i < slots->nmemslots; i++) {
-		struct kvm_memory_slot *memslot = &slots->memslots[i];
+	kvm_for_each_memslot(memslot, slots) {
 		unsigned long start = memslot->userspace_addr;
 		unsigned long end;
 
@@ -3985,15 +3985,15 @@ nomem:
  */
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 {
-	int i;
 	unsigned int nr_mmu_pages;
 	unsigned int  nr_pages = 0;
 	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
 
 	slots = kvm_memslots(kvm);
 
-	for (i = 0; i < slots->nmemslots; i++)
-		nr_pages += slots->memslots[i].npages;
+	kvm_for_each_memslot(memslot, slots)
+		nr_pages += memslot->npages;
 
 	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
 	nr_mmu_pages = max(nr_mmu_pages,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 23f795c66220..392af47a4353 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -308,6 +308,10 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
 	     (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \
 	     idx++)
 
+#define kvm_for_each_memslot(memslot, slots)	\
+	for (memslot = &slots->memslots[0];	\
+	      memslot < slots->memslots + (slots)->nmemslots; memslot++)
+
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
 
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index a195c07fa829..4e5f7b7f1d2b 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -134,14 +134,15 @@ unmap_pages:
 
 static int kvm_iommu_map_memslots(struct kvm *kvm)
 {
-	int i, idx, r = 0;
+	int idx, r = 0;
 	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
 
 	idx = srcu_read_lock(&kvm->srcu);
 	slots = kvm_memslots(kvm);
 
-	for (i = 0; i < slots->nmemslots; i++) {
-		r = kvm_iommu_map_pages(kvm, &slots->memslots[i]);
+	kvm_for_each_memslot(memslot, slots) {
+		r = kvm_iommu_map_pages(kvm, memslot);
 		if (r)
 			break;
 	}
@@ -311,16 +312,16 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
 
 static int kvm_iommu_unmap_memslots(struct kvm *kvm)
 {
-	int i, idx;
+	int idx;
 	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
 
 	idx = srcu_read_lock(&kvm->srcu);
 	slots = kvm_memslots(kvm);
 
-	for (i = 0; i < slots->nmemslots; i++) {
-		kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
-				    slots->memslots[i].npages);
-	}
+	kvm_for_each_memslot(memslot, slots)
+		kvm_iommu_put_pages(kvm, memslot->base_gfn, memslot->npages);
+
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	return 0;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b5ed7770ced3..4c2900c5d81d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -547,11 +547,11 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 
 void kvm_free_physmem(struct kvm *kvm)
 {
-	int i;
 	struct kvm_memslots *slots = kvm->memslots;
+	struct kvm_memory_slot *memslot;
 
-	for (i = 0; i < slots->nmemslots; ++i)
-		kvm_free_physmem_slot(&slots->memslots[i], NULL);
+	kvm_for_each_memslot(memslot, slots)
+		kvm_free_physmem_slot(memslot, NULL);
 
 	kfree(kvm->memslots);
 }
@@ -975,15 +975,13 @@ EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
 						gfn_t gfn)
 {
-	int i;
-
-	for (i = 0; i < slots->nmemslots; ++i) {
-		struct kvm_memory_slot *memslot = &slots->memslots[i];
+	struct kvm_memory_slot *memslot;
 
+	kvm_for_each_memslot(memslot, slots)
 		if (gfn >= memslot->base_gfn
 		    && gfn < memslot->base_gfn + memslot->npages)
 			return memslot;
-	}
+
 	return NULL;
 }
 
-- 
cgit v1.2.3-55-g7522


From 28a37544fb0223eb9805d2567b88f7360edec52a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Thu, 24 Nov 2011 19:04:35 +0800
Subject: KVM: introduce id_to_memslot function

Introduce id_to_memslot to get memslot by slot id

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c  |  2 +-
 arch/powerpc/kvm/book3s.c |  2 +-
 arch/x86/kvm/vmx.c        |  6 ++++--
 arch/x86/kvm/x86.c        | 18 +++++++++---------
 include/linux/kvm_host.h  |  6 ++++++
 virt/kvm/kvm_main.c       | 13 +++++++++----
 6 files changed, 30 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 42ad1f9c9f01..92d9f1e4740f 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1818,7 +1818,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	if (log->slot >= KVM_MEMORY_SLOTS)
 		goto out;
 
-	memslot = &kvm->memslots->memslots[log->slot];
+	memslot = id_to_memslot(kvm->memslots, log->slot);
 	r = -ENOENT;
 	if (!memslot->dirty_bitmap)
 		goto out;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index a459479995c6..e41ac6f7dcf1 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -498,7 +498,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (is_dirty) {
-		memslot = &kvm->memslots->memslots[log->slot];
+		memslot = id_to_memslot(kvm->memslots, log->slot);
 
 		ga = memslot->base_gfn << PAGE_SHIFT;
 		ga_end = ga + (memslot->npages << PAGE_SHIFT);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ba24022f4575..8f19d91ec3e7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2711,11 +2711,13 @@ static gva_t rmode_tss_base(struct kvm *kvm)
 {
 	if (!kvm->arch.tss_addr) {
 		struct kvm_memslots *slots;
+		struct kvm_memory_slot *slot;
 		gfn_t base_gfn;
 
 		slots = kvm_memslots(kvm);
-		base_gfn = slots->memslots[0].base_gfn +
-				 kvm->memslots->memslots[0].npages - 3;
+		slot = id_to_memslot(slots, 0);
+		base_gfn = slot->base_gfn + slot->npages - 3;
+
 		return base_gfn << PAGE_SHIFT;
 	}
 	return kvm->arch.tss_addr;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 917a287d21c8..b6776c613e6d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3520,7 +3520,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	if (log->slot >= KVM_MEMORY_SLOTS)
 		goto out;
 
-	memslot = &kvm->memslots->memslots[log->slot];
+	memslot = id_to_memslot(kvm->memslots, log->slot);
 	r = -ENOENT;
 	if (!memslot->dirty_bitmap)
 		goto out;
@@ -3531,27 +3531,27 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (nr_dirty_pages) {
 		struct kvm_memslots *slots, *old_slots;
-		unsigned long *dirty_bitmap;
+		unsigned long *dirty_bitmap, *dirty_bitmap_head;
 
-		dirty_bitmap = memslot->dirty_bitmap_head;
-		if (memslot->dirty_bitmap == dirty_bitmap)
-			dirty_bitmap += n / sizeof(long);
-		memset(dirty_bitmap, 0, n);
+		dirty_bitmap = memslot->dirty_bitmap;
+		dirty_bitmap_head = memslot->dirty_bitmap_head;
+		if (dirty_bitmap == dirty_bitmap_head)
+			dirty_bitmap_head += n / sizeof(long);
+		memset(dirty_bitmap_head, 0, n);
 
 		r = -ENOMEM;
 		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 		if (!slots)
 			goto out;
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
-		memslot = &slots->memslots[log->slot];
-		memslot->dirty_bitmap = dirty_bitmap;
+		memslot = id_to_memslot(slots, log->slot);
 		memslot->nr_dirty_pages = 0;
+		memslot->dirty_bitmap = dirty_bitmap_head;
 		update_memslots(slots, NULL);
 
 		old_slots = kvm->memslots;
 		rcu_assign_pointer(kvm->memslots, slots);
 		synchronize_srcu_expedited(&kvm->srcu);
-		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
 		kfree(old_slots);
 
 		write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 392af47a4353..123925cd3ae8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -333,6 +333,12 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
 			|| lockdep_is_held(&kvm->slots_lock));
 }
 
+static inline struct kvm_memory_slot *
+id_to_memslot(struct kvm_memslots *slots, int id)
+{
+	return &slots->memslots[id];
+}
+
 #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
 #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
 static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4c2900c5d81d..7b6084999424 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -634,8 +634,9 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
 {
 	if (new) {
 		int id = new->id;
+		struct kvm_memory_slot *old = id_to_memslot(slots, id);
 
-		slots->memslots[id] = *new;
+		*old = *new;
 		if (id >= slots->nmemslots)
 			slots->nmemslots = id + 1;
 	}
@@ -681,7 +682,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
 		goto out;
 
-	memslot = &kvm->memslots->memslots[mem->slot];
+	memslot = id_to_memslot(kvm->memslots, mem->slot);
 	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
 	npages = mem->memory_size >> PAGE_SHIFT;
 
@@ -788,12 +789,16 @@ skip_lpage:
 #endif /* not defined CONFIG_S390 */
 
 	if (!npages) {
+		struct kvm_memory_slot *slot;
+
 		r = -ENOMEM;
 		slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
 				GFP_KERNEL);
 		if (!slots)
 			goto out_free;
-		slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
+		slot = id_to_memslot(slots, mem->slot);
+		slot->flags |= KVM_MEMSLOT_INVALID;
+
 		update_memslots(slots, NULL);
 
 		old_memslots = kvm->memslots;
@@ -897,7 +902,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
 	if (log->slot >= KVM_MEMORY_SLOTS)
 		goto out;
 
-	memslot = &kvm->memslots->memslots[log->slot];
+	memslot = id_to_memslot(kvm->memslots, log->slot);
 	r = -ENOENT;
 	if (!memslot->dirty_bitmap)
 		goto out;
-- 
cgit v1.2.3-55-g7522


From 2b5e97e1fadf1ade87558f2a2003616879f9e228 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Wed, 23 Nov 2011 12:27:39 +0900
Subject: KVM: x86 emulator: Use opcode::execute for INS/OUTS from/to port in
 DX

INSB       : 6C
INSW/INSD  : 6D
OUTSB      : 6E
OUTSW/OUTSD: 6F

The I/O port address is read from the DX register when we decode the
operand because we see the SrcDX/DstDX flag is set.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4cd3313b4131..ac8e5ed78834 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3321,8 +3321,8 @@ static struct opcode opcode_table[256] = {
 	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
 	I(SrcImmByte | Mov | Stack, em_push),
 	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
-	D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */
-	D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */
+	I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
+	I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
 	/* 0x70 - 0x7F */
 	X16(D(SrcImmByte)),
 	/* 0x80 - 0x87 */
@@ -4027,16 +4027,6 @@ special_insn:
 			goto cannot_emulate;
 		ctxt->dst.val = (s32) ctxt->src.val;
 		break;
-	case 0x6c:		/* insb */
-	case 0x6d:		/* insw/insd */
-		ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
-		rc = em_in(ctxt);
-		break;
-	case 0x6e:		/* outsb */
-	case 0x6f:		/* outsw/outsd */
-		ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
-		rc = em_out(ctxt);
-		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(ctxt->b, ctxt->eflags))
 			jmp_rel(ctxt, ctxt->src.val);
-- 
cgit v1.2.3-55-g7522


From 00b27a3efb116062ca5a276ad5cb01ea1b80b5f6 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 23 Nov 2011 16:30:32 +0200
Subject: KVM: Move cpuid code to new file

The cpuid code has grown; put it into a separate file.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/Makefile |   2 +-
 arch/x86/kvm/cpuid.c  | 625 +++++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/cpuid.h  |  46 ++++
 arch/x86/kvm/lapic.c  |   1 +
 arch/x86/kvm/vmx.c    |   1 +
 arch/x86/kvm/x86.c    | 634 +-------------------------------------------------
 arch/x86/kvm/x86.h    |   5 +-
 7 files changed, 679 insertions(+), 635 deletions(-)
 create mode 100644 arch/x86/kvm/cpuid.c
 create mode 100644 arch/x86/kvm/cpuid.h

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f431c8..161b76ae87c4 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
-			   i8254.o timer.o
+			   i8254.o timer.o cpuid.o
 kvm-intel-y		+= vmx.o
 kvm-amd-y		+= svm.o
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
new file mode 100644
index 000000000000..0a332ec5203c
--- /dev/null
+++ b/arch/x86/kvm/cpuid.c
@@ -0,0 +1,625 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ * cpuid support routines
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ * Copyright IBM Corporation, 2008
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <asm/user.h>
+#include <asm/xsave.h>
+#include "cpuid.h"
+#include "lapic.h"
+#include "mmu.h"
+#include "trace.h"
+
+void kvm_update_cpuid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	if (!best)
+		return;
+
+	/* Update OSXSAVE bit */
+	if (cpu_has_xsave && best->function == 0x1) {
+		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+			best->ecx |= bit(X86_FEATURE_OSXSAVE);
+	}
+
+	if (apic) {
+		if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
+			apic->lapic_timer.timer_mode_mask = 3 << 17;
+		else
+			apic->lapic_timer.timer_mode_mask = 1 << 17;
+	}
+}
+
+static int is_efer_nx(void)
+{
+	unsigned long long efer = 0;
+
+	rdmsrl_safe(MSR_EFER, &efer);
+	return efer & EFER_NX;
+}
+
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvm_cpuid_entry2 *e, *entry;
+
+	entry = NULL;
+	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+		e = &vcpu->arch.cpuid_entries[i];
+		if (e->function == 0x80000001) {
+			entry = e;
+			break;
+		}
+	}
+	if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
+		entry->edx &= ~(1 << 20);
+		printk(KERN_INFO "kvm: guest NX capability removed\n");
+	}
+}
+
+/* when an old userspace process fills a new kernel module */
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+			     struct kvm_cpuid *cpuid,
+			     struct kvm_cpuid_entry __user *entries)
+{
+	int r, i;
+	struct kvm_cpuid_entry *cpuid_entries;
+
+	r = -E2BIG;
+	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+		goto out;
+	r = -ENOMEM;
+	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
+	if (!cpuid_entries)
+		goto out;
+	r = -EFAULT;
+	if (copy_from_user(cpuid_entries, entries,
+			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+		goto out_free;
+	for (i = 0; i < cpuid->nent; i++) {
+		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
+		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
+		vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
+		vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
+		vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
+		vcpu->arch.cpuid_entries[i].index = 0;
+		vcpu->arch.cpuid_entries[i].flags = 0;
+		vcpu->arch.cpuid_entries[i].padding[0] = 0;
+		vcpu->arch.cpuid_entries[i].padding[1] = 0;
+		vcpu->arch.cpuid_entries[i].padding[2] = 0;
+	}
+	vcpu->arch.cpuid_nent = cpuid->nent;
+	cpuid_fix_nx_cap(vcpu);
+	r = 0;
+	kvm_apic_set_version(vcpu);
+	kvm_x86_ops->cpuid_update(vcpu);
+	kvm_update_cpuid(vcpu);
+
+out_free:
+	vfree(cpuid_entries);
+out:
+	return r;
+}
+
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+			      struct kvm_cpuid2 *cpuid,
+			      struct kvm_cpuid_entry2 __user *entries)
+{
+	int r;
+
+	r = -E2BIG;
+	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+		goto out;
+	r = -EFAULT;
+	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
+			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+		goto out;
+	vcpu->arch.cpuid_nent = cpuid->nent;
+	kvm_apic_set_version(vcpu);
+	kvm_x86_ops->cpuid_update(vcpu);
+	kvm_update_cpuid(vcpu);
+	return 0;
+
+out:
+	return r;
+}
+
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+			      struct kvm_cpuid2 *cpuid,
+			      struct kvm_cpuid_entry2 __user *entries)
+{
+	int r;
+
+	r = -E2BIG;
+	if (cpuid->nent < vcpu->arch.cpuid_nent)
+		goto out;
+	r = -EFAULT;
+	if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+		goto out;
+	return 0;
+
+out:
+	cpuid->nent = vcpu->arch.cpuid_nent;
+	return r;
+}
+
+static void cpuid_mask(u32 *word, int wordnum)
+{
+	*word &= boot_cpu_data.x86_capability[wordnum];
+}
+
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+			   u32 index)
+{
+	entry->function = function;
+	entry->index = index;
+	cpuid_count(entry->function, entry->index,
+		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+	entry->flags = 0;
+}
+
+static bool supported_xcr0_bit(unsigned bit)
+{
+	u64 mask = ((u64)1 << bit);
+
+	return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
+}
+
+#define F(x) bit(X86_FEATURE_##x)
+
+static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+			 u32 index, int *nent, int maxnent)
+{
+	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
+#ifdef CONFIG_X86_64
+	unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
+				? F(GBPAGES) : 0;
+	unsigned f_lm = F(LM);
+#else
+	unsigned f_gbpages = 0;
+	unsigned f_lm = 0;
+#endif
+	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
+
+	/* cpuid 1.edx */
+	const u32 kvm_supported_word0_x86_features =
+		F(FPU) | F(VME) | F(DE) | F(PSE) |
+		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
+		0 /* Reserved, DS, ACPI */ | F(MMX) |
+		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+		0 /* HTT, TM, Reserved, PBE */;
+	/* cpuid 0x80000001.edx */
+	const u32 kvm_supported_word1_x86_features =
+		F(FPU) | F(VME) | F(DE) | F(PSE) |
+		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+		F(PAT) | F(PSE36) | 0 /* Reserved */ |
+		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+		F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
+		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
+	/* cpuid 1.ecx */
+	const u32 kvm_supported_word4_x86_features =
+		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
+		0 /* DS-CPL, VMX, SMX, EST */ |
+		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
+		0 /* Reserved, DCA */ | F(XMM4_1) |
+		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
+		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
+		F(F16C) | F(RDRAND);
+	/* cpuid 0x80000001.ecx */
+	const u32 kvm_supported_word6_x86_features =
+		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
+		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
+
+	/* cpuid 0xC0000001.edx */
+	const u32 kvm_supported_word5_x86_features =
+		F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
+		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
+		F(PMM) | F(PMM_EN);
+
+	/* cpuid 7.0.ebx */
+	const u32 kvm_supported_word9_x86_features =
+		F(SMEP) | F(FSGSBASE) | F(ERMS);
+
+	/* all calls to cpuid_count() should be made on the same cpu */
+	get_cpu();
+	do_cpuid_1_ent(entry, function, index);
+	++*nent;
+
+	switch (function) {
+	case 0:
+		entry->eax = min(entry->eax, (u32)0xd);
+		break;
+	case 1:
+		entry->edx &= kvm_supported_word0_x86_features;
+		cpuid_mask(&entry->edx, 0);
+		entry->ecx &= kvm_supported_word4_x86_features;
+		cpuid_mask(&entry->ecx, 4);
+		/* we support x2apic emulation even if host does not support
+		 * it since we emulate x2apic in software */
+		entry->ecx |= F(X2APIC);
+		break;
+	/* function 2 entries are STATEFUL. That is, repeated cpuid commands
+	 * may return different values. This forces us to get_cpu() before
+	 * issuing the first command, and also to emulate this annoying behavior
+	 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
+	case 2: {
+		int t, times = entry->eax & 0xff;
+
+		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+		for (t = 1; t < times && *nent < maxnent; ++t) {
+			do_cpuid_1_ent(&entry[t], function, 0);
+			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+			++*nent;
+		}
+		break;
+	}
+	/* function 4 has additional index. */
+	case 4: {
+		int i, cache_type;
+
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		/* read more entries until cache_type is zero */
+		for (i = 1; *nent < maxnent; ++i) {
+			cache_type = entry[i - 1].eax & 0x1f;
+			if (!cache_type)
+				break;
+			do_cpuid_1_ent(&entry[i], function, i);
+			entry[i].flags |=
+			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			++*nent;
+		}
+		break;
+	}
+	case 7: {
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		/* Mask ebx against host capbability word 9 */
+		if (index == 0) {
+			entry->ebx &= kvm_supported_word9_x86_features;
+			cpuid_mask(&entry->ebx, 9);
+		} else
+			entry->ebx = 0;
+		entry->eax = 0;
+		entry->ecx = 0;
+		entry->edx = 0;
+		break;
+	}
+	case 9:
+		break;
+	/* function 0xb has additional index. */
+	case 0xb: {
+		int i, level_type;
+
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		/* read more entries until level_type is zero */
+		for (i = 1; *nent < maxnent; ++i) {
+			level_type = entry[i - 1].ecx & 0xff00;
+			if (!level_type)
+				break;
+			do_cpuid_1_ent(&entry[i], function, i);
+			entry[i].flags |=
+			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			++*nent;
+		}
+		break;
+	}
+	case 0xd: {
+		int idx, i;
+
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
+			do_cpuid_1_ent(&entry[i], function, idx);
+			if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
+				continue;
+			entry[i].flags |=
+			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			++*nent;
+			++i;
+		}
+		break;
+	}
+	case KVM_CPUID_SIGNATURE: {
+		char signature[12] = "KVMKVMKVM\0\0";
+		u32 *sigptr = (u32 *)signature;
+		entry->eax = 0;
+		entry->ebx = sigptr[0];
+		entry->ecx = sigptr[1];
+		entry->edx = sigptr[2];
+		break;
+	}
+	case KVM_CPUID_FEATURES:
+		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
+			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
+			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
+			     (1 << KVM_FEATURE_ASYNC_PF) |
+			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+
+		if (sched_info_on())
+			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+
+		entry->ebx = 0;
+		entry->ecx = 0;
+		entry->edx = 0;
+		break;
+	case 0x80000000:
+		entry->eax = min(entry->eax, 0x8000001a);
+		break;
+	case 0x80000001:
+		entry->edx &= kvm_supported_word1_x86_features;
+		cpuid_mask(&entry->edx, 1);
+		entry->ecx &= kvm_supported_word6_x86_features;
+		cpuid_mask(&entry->ecx, 6);
+		break;
+	case 0x80000008: {
+		unsigned g_phys_as = (entry->eax >> 16) & 0xff;
+		unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
+		unsigned phys_as = entry->eax & 0xff;
+
+		if (!g_phys_as)
+			g_phys_as = phys_as;
+		entry->eax = g_phys_as | (virt_as << 8);
+		entry->ebx = entry->edx = 0;
+		break;
+	}
+	case 0x80000019:
+		entry->ecx = entry->edx = 0;
+		break;
+	case 0x8000001a:
+		break;
+	case 0x8000001d:
+		break;
+	/*Add support for Centaur's CPUID instruction*/
+	case 0xC0000000:
+		/*Just support up to 0xC0000004 now*/
+		entry->eax = min(entry->eax, 0xC0000004);
+		break;
+	case 0xC0000001:
+		entry->edx &= kvm_supported_word5_x86_features;
+		cpuid_mask(&entry->edx, 5);
+		break;
+	case 3: /* Processor serial number */
+	case 5: /* MONITOR/MWAIT */
+	case 6: /* Thermal management */
+	case 0xA: /* Architectural Performance Monitoring */
+	case 0x80000007: /* Advanced power management */
+	case 0xC0000002:
+	case 0xC0000003:
+	case 0xC0000004:
+	default:
+		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+		break;
+	}
+
+	kvm_x86_ops->set_supported_cpuid(function, entry);
+
+	put_cpu();
+}
+
+#undef F
+
+int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
+				      struct kvm_cpuid_entry2 __user *entries)
+{
+	struct kvm_cpuid_entry2 *cpuid_entries;
+	int limit, nent = 0, r = -E2BIG;
+	u32 func;
+
+	if (cpuid->nent < 1)
+		goto out;
+	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+		cpuid->nent = KVM_MAX_CPUID_ENTRIES;
+	r = -ENOMEM;
+	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+	if (!cpuid_entries)
+		goto out;
+
+	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
+	limit = cpuid_entries[0].eax;
+	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
+		do_cpuid_ent(&cpuid_entries[nent], func, 0,
+			     &nent, cpuid->nent);
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
+	limit = cpuid_entries[nent - 1].eax;
+	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
+		do_cpuid_ent(&cpuid_entries[nent], func, 0,
+			     &nent, cpuid->nent);
+
+
+
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	/* Add support for Centaur's CPUID instruction. */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
+		do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
+				&nent, cpuid->nent);
+
+		r = -E2BIG;
+		if (nent >= cpuid->nent)
+			goto out_free;
+
+		limit = cpuid_entries[nent - 1].eax;
+		for (func = 0xC0000001;
+			func <= limit && nent < cpuid->nent; ++func)
+			do_cpuid_ent(&cpuid_entries[nent], func, 0,
+					&nent, cpuid->nent);
+
+		r = -E2BIG;
+		if (nent >= cpuid->nent)
+			goto out_free;
+	}
+
+	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
+		     cpuid->nent);
+
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
+		     cpuid->nent);
+
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	r = -EFAULT;
+	if (copy_to_user(entries, cpuid_entries,
+			 nent * sizeof(struct kvm_cpuid_entry2)))
+		goto out_free;
+	cpuid->nent = nent;
+	r = 0;
+
+out_free:
+	vfree(cpuid_entries);
+out:
+	return r;
+}
+
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+{
+	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
+	int j, nent = vcpu->arch.cpuid_nent;
+
+	e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+	/* when no next entry is found, the current entry[i] is reselected */
+	for (j = i + 1; ; j = (j + 1) % nent) {
+		struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
+		if (ej->function == e->function) {
+			ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+			return j;
+		}
+	}
+	return 0; /* silence gcc, even though control never reaches here */
+}
+
+/* find an entry with matching function, matching index (if needed), and that
+ * should be read next (if it's stateful) */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+	u32 function, u32 index)
+{
+	if (e->function != function)
+		return 0;
+	if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
+		return 0;
+	if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
+	    !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+		return 0;
+	return 1;
+}
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+					      u32 function, u32 index)
+{
+	int i;
+	struct kvm_cpuid_entry2 *best = NULL;
+
+	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+		struct kvm_cpuid_entry2 *e;
+
+		e = &vcpu->arch.cpuid_entries[i];
+		if (is_matching_cpuid_entry(e, function, index)) {
+			if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+				move_to_next_stateful_cpuid_entry(vcpu, i);
+			best = e;
+			break;
+		}
+	}
+	return best;
+}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
+
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+	if (!best || best->eax < 0x80000008)
+		goto not_found;
+	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+	if (best)
+		return best->eax & 0xff;
+not_found:
+	return 36;
+}
+
+/*
+ * If no match is found, check whether we exceed the vCPU's limit
+ * and return the content of the highest valid _standard_ leaf instead.
+ * This is to satisfy the CPUID specification.
+ */
+static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
+                                                  u32 function, u32 index)
+{
+	struct kvm_cpuid_entry2 *maxlevel;
+
+	maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
+	if (!maxlevel || maxlevel->eax >= function)
+		return NULL;
+	if (function & 0x80000000) {
+		maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
+		if (!maxlevel)
+			return NULL;
+	}
+	return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
+}
+
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+	u32 function, index;
+	struct kvm_cpuid_entry2 *best;
+
+	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
+	best = kvm_find_cpuid_entry(vcpu, function, index);
+
+	if (!best)
+		best = check_cpuid_limit(vcpu, function, index);
+
+	if (best) {
+		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
+		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
+		kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
+		kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
+	}
+	kvm_x86_ops->skip_emulated_instruction(vcpu);
+	trace_kvm_cpuid(function,
+			kvm_register_read(vcpu, VCPU_REGS_RAX),
+			kvm_register_read(vcpu, VCPU_REGS_RBX),
+			kvm_register_read(vcpu, VCPU_REGS_RCX),
+			kvm_register_read(vcpu, VCPU_REGS_RDX));
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
new file mode 100644
index 000000000000..5b97e1797a6d
--- /dev/null
+++ b/arch/x86/kvm/cpuid.h
@@ -0,0 +1,46 @@
+#ifndef ARCH_X86_KVM_CPUID_H
+#define ARCH_X86_KVM_CPUID_H
+
+#include "x86.h"
+
+void kvm_update_cpuid(struct kvm_vcpu *vcpu);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+					      u32 function, u32 index);
+int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
+				      struct kvm_cpuid_entry2 __user *entries);
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+			     struct kvm_cpuid *cpuid,
+			     struct kvm_cpuid_entry __user *entries);
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+			      struct kvm_cpuid2 *cpuid,
+			      struct kvm_cpuid_entry2 __user *entries);
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+			      struct kvm_cpuid2 *cpuid,
+			      struct kvm_cpuid_entry2 __user *entries);
+
+
+static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
+}
+
+static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_SMEP));
+}
+
+static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
+}
+
+#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 54abb40199d6..a7f3e655cd3e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -38,6 +38,7 @@
 #include "irq.h"
 #include "trace.h"
 #include "x86.h"
+#include "cpuid.h"
 
 #ifndef CONFIG_X86_64
 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8f19d91ec3e7..4ceced2669ef 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -18,6 +18,7 @@
 
 #include "irq.h"
 #include "mmu.h"
+#include "cpuid.h"
 
 #include <linux/kvm_host.h>
 #include <linux/module.h>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b6776c613e6d..4e533d24c513 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -26,6 +26,7 @@
 #include "tss.h"
 #include "kvm_cache_regs.h"
 #include "x86.h"
+#include "cpuid.h"
 
 #include <linux/clocksource.h>
 #include <linux/interrupt.h>
@@ -82,8 +83,6 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
-static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-				    struct kvm_cpuid_entry2 __user *entries);
 static void process_nmi(struct kvm_vcpu *vcpu);
 
 struct kvm_x86_ops *kvm_x86_ops;
@@ -574,54 +573,6 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 }
 EXPORT_SYMBOL_GPL(kvm_set_xcr);
 
-static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 1, 0);
-	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
-}
-
-static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 7, 0);
-	return best && (best->ebx & bit(X86_FEATURE_SMEP));
-}
-
-static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 7, 0);
-	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
-}
-
-static void update_cpuid(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-	struct kvm_lapic *apic = vcpu->arch.apic;
-
-	best = kvm_find_cpuid_entry(vcpu, 1, 0);
-	if (!best)
-		return;
-
-	/* Update OSXSAVE bit */
-	if (cpu_has_xsave && best->function == 0x1) {
-		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
-			best->ecx |= bit(X86_FEATURE_OSXSAVE);
-	}
-
-	if (apic) {
-		if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
-			apic->lapic_timer.timer_mode_mask = 3 << 17;
-		else
-			apic->lapic_timer.timer_mode_mask = 1 << 17;
-	}
-}
-
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long old_cr4 = kvm_read_cr4(vcpu);
@@ -655,7 +606,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		kvm_mmu_reset_context(vcpu);
 
 	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
-		update_cpuid(vcpu);
+		kvm_update_cpuid(vcpu);
 
 	return 0;
 }
@@ -2265,466 +2216,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
 }
 
-static int is_efer_nx(void)
-{
-	unsigned long long efer = 0;
-
-	rdmsrl_safe(MSR_EFER, &efer);
-	return efer & EFER_NX;
-}
-
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
-{
-	int i;
-	struct kvm_cpuid_entry2 *e, *entry;
-
-	entry = NULL;
-	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
-		e = &vcpu->arch.cpuid_entries[i];
-		if (e->function == 0x80000001) {
-			entry = e;
-			break;
-		}
-	}
-	if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
-		entry->edx &= ~(1 << 20);
-		printk(KERN_INFO "kvm: guest NX capability removed\n");
-	}
-}
-
-/* when an old userspace process fills a new kernel module */
-static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
-				    struct kvm_cpuid *cpuid,
-				    struct kvm_cpuid_entry __user *entries)
-{
-	int r, i;
-	struct kvm_cpuid_entry *cpuid_entries;
-
-	r = -E2BIG;
-	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
-		goto out;
-	r = -ENOMEM;
-	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
-	if (!cpuid_entries)
-		goto out;
-	r = -EFAULT;
-	if (copy_from_user(cpuid_entries, entries,
-			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
-		goto out_free;
-	for (i = 0; i < cpuid->nent; i++) {
-		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
-		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
-		vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
-		vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
-		vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
-		vcpu->arch.cpuid_entries[i].index = 0;
-		vcpu->arch.cpuid_entries[i].flags = 0;
-		vcpu->arch.cpuid_entries[i].padding[0] = 0;
-		vcpu->arch.cpuid_entries[i].padding[1] = 0;
-		vcpu->arch.cpuid_entries[i].padding[2] = 0;
-	}
-	vcpu->arch.cpuid_nent = cpuid->nent;
-	cpuid_fix_nx_cap(vcpu);
-	r = 0;
-	kvm_apic_set_version(vcpu);
-	kvm_x86_ops->cpuid_update(vcpu);
-	update_cpuid(vcpu);
-
-out_free:
-	vfree(cpuid_entries);
-out:
-	return r;
-}
-
-static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
-				     struct kvm_cpuid2 *cpuid,
-				     struct kvm_cpuid_entry2 __user *entries)
-{
-	int r;
-
-	r = -E2BIG;
-	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
-		goto out;
-	r = -EFAULT;
-	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
-			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
-		goto out;
-	vcpu->arch.cpuid_nent = cpuid->nent;
-	kvm_apic_set_version(vcpu);
-	kvm_x86_ops->cpuid_update(vcpu);
-	update_cpuid(vcpu);
-	return 0;
-
-out:
-	return r;
-}
-
-static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
-				     struct kvm_cpuid2 *cpuid,
-				     struct kvm_cpuid_entry2 __user *entries)
-{
-	int r;
-
-	r = -E2BIG;
-	if (cpuid->nent < vcpu->arch.cpuid_nent)
-		goto out;
-	r = -EFAULT;
-	if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
-			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
-		goto out;
-	return 0;
-
-out:
-	cpuid->nent = vcpu->arch.cpuid_nent;
-	return r;
-}
-
-static void cpuid_mask(u32 *word, int wordnum)
-{
-	*word &= boot_cpu_data.x86_capability[wordnum];
-}
-
-static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-			   u32 index)
-{
-	entry->function = function;
-	entry->index = index;
-	cpuid_count(entry->function, entry->index,
-		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
-	entry->flags = 0;
-}
-
-static bool supported_xcr0_bit(unsigned bit)
-{
-	u64 mask = ((u64)1 << bit);
-
-	return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
-}
-
-#define F(x) bit(X86_FEATURE_##x)
-
-static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-			 u32 index, int *nent, int maxnent)
-{
-	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
-#ifdef CONFIG_X86_64
-	unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
-				? F(GBPAGES) : 0;
-	unsigned f_lm = F(LM);
-#else
-	unsigned f_gbpages = 0;
-	unsigned f_lm = 0;
-#endif
-	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
-
-	/* cpuid 1.edx */
-	const u32 kvm_supported_word0_x86_features =
-		F(FPU) | F(VME) | F(DE) | F(PSE) |
-		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
-		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
-		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
-		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
-		0 /* Reserved, DS, ACPI */ | F(MMX) |
-		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
-		0 /* HTT, TM, Reserved, PBE */;
-	/* cpuid 0x80000001.edx */
-	const u32 kvm_supported_word1_x86_features =
-		F(FPU) | F(VME) | F(DE) | F(PSE) |
-		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
-		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
-		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
-		F(PAT) | F(PSE36) | 0 /* Reserved */ |
-		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
-		F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
-		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
-	/* cpuid 1.ecx */
-	const u32 kvm_supported_word4_x86_features =
-		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
-		0 /* DS-CPL, VMX, SMX, EST */ |
-		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
-		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
-		0 /* Reserved, DCA */ | F(XMM4_1) |
-		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
-		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
-		F(F16C) | F(RDRAND);
-	/* cpuid 0x80000001.ecx */
-	const u32 kvm_supported_word6_x86_features =
-		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
-		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
-		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
-		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
-
-	/* cpuid 0xC0000001.edx */
-	const u32 kvm_supported_word5_x86_features =
-		F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
-		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
-		F(PMM) | F(PMM_EN);
-
-	/* cpuid 7.0.ebx */
-	const u32 kvm_supported_word9_x86_features =
-		F(SMEP) | F(FSGSBASE) | F(ERMS);
-
-	/* all calls to cpuid_count() should be made on the same cpu */
-	get_cpu();
-	do_cpuid_1_ent(entry, function, index);
-	++*nent;
-
-	switch (function) {
-	case 0:
-		entry->eax = min(entry->eax, (u32)0xd);
-		break;
-	case 1:
-		entry->edx &= kvm_supported_word0_x86_features;
-		cpuid_mask(&entry->edx, 0);
-		entry->ecx &= kvm_supported_word4_x86_features;
-		cpuid_mask(&entry->ecx, 4);
-		/* we support x2apic emulation even if host does not support
-		 * it since we emulate x2apic in software */
-		entry->ecx |= F(X2APIC);
-		break;
-	/* function 2 entries are STATEFUL. That is, repeated cpuid commands
-	 * may return different values. This forces us to get_cpu() before
-	 * issuing the first command, and also to emulate this annoying behavior
-	 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
-	case 2: {
-		int t, times = entry->eax & 0xff;
-
-		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
-		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
-		for (t = 1; t < times && *nent < maxnent; ++t) {
-			do_cpuid_1_ent(&entry[t], function, 0);
-			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
-			++*nent;
-		}
-		break;
-	}
-	/* function 4 has additional index. */
-	case 4: {
-		int i, cache_type;
-
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		/* read more entries until cache_type is zero */
-		for (i = 1; *nent < maxnent; ++i) {
-			cache_type = entry[i - 1].eax & 0x1f;
-			if (!cache_type)
-				break;
-			do_cpuid_1_ent(&entry[i], function, i);
-			entry[i].flags |=
-			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-			++*nent;
-		}
-		break;
-	}
-	case 7: {
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		/* Mask ebx against host capbability word 9 */
-		if (index == 0) {
-			entry->ebx &= kvm_supported_word9_x86_features;
-			cpuid_mask(&entry->ebx, 9);
-		} else
-			entry->ebx = 0;
-		entry->eax = 0;
-		entry->ecx = 0;
-		entry->edx = 0;
-		break;
-	}
-	case 9:
-		break;
-	/* function 0xb has additional index. */
-	case 0xb: {
-		int i, level_type;
-
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		/* read more entries until level_type is zero */
-		for (i = 1; *nent < maxnent; ++i) {
-			level_type = entry[i - 1].ecx & 0xff00;
-			if (!level_type)
-				break;
-			do_cpuid_1_ent(&entry[i], function, i);
-			entry[i].flags |=
-			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-			++*nent;
-		}
-		break;
-	}
-	case 0xd: {
-		int idx, i;
-
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
-			do_cpuid_1_ent(&entry[i], function, idx);
-			if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
-				continue;
-			entry[i].flags |=
-			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-			++*nent;
-			++i;
-		}
-		break;
-	}
-	case KVM_CPUID_SIGNATURE: {
-		char signature[12] = "KVMKVMKVM\0\0";
-		u32 *sigptr = (u32 *)signature;
-		entry->eax = 0;
-		entry->ebx = sigptr[0];
-		entry->ecx = sigptr[1];
-		entry->edx = sigptr[2];
-		break;
-	}
-	case KVM_CPUID_FEATURES:
-		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
-			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
-			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
-			     (1 << KVM_FEATURE_ASYNC_PF) |
-			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
-
-		if (sched_info_on())
-			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
-
-		entry->ebx = 0;
-		entry->ecx = 0;
-		entry->edx = 0;
-		break;
-	case 0x80000000:
-		entry->eax = min(entry->eax, 0x8000001a);
-		break;
-	case 0x80000001:
-		entry->edx &= kvm_supported_word1_x86_features;
-		cpuid_mask(&entry->edx, 1);
-		entry->ecx &= kvm_supported_word6_x86_features;
-		cpuid_mask(&entry->ecx, 6);
-		break;
-	case 0x80000008: {
-		unsigned g_phys_as = (entry->eax >> 16) & 0xff;
-		unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
-		unsigned phys_as = entry->eax & 0xff;
-
-		if (!g_phys_as)
-			g_phys_as = phys_as;
-		entry->eax = g_phys_as | (virt_as << 8);
-		entry->ebx = entry->edx = 0;
-		break;
-	}
-	case 0x80000019:
-		entry->ecx = entry->edx = 0;
-		break;
-	case 0x8000001a:
-		break;
-	case 0x8000001d:
-		break;
-	/*Add support for Centaur's CPUID instruction*/
-	case 0xC0000000:
-		/*Just support up to 0xC0000004 now*/
-		entry->eax = min(entry->eax, 0xC0000004);
-		break;
-	case 0xC0000001:
-		entry->edx &= kvm_supported_word5_x86_features;
-		cpuid_mask(&entry->edx, 5);
-		break;
-	case 3: /* Processor serial number */
-	case 5: /* MONITOR/MWAIT */
-	case 6: /* Thermal management */
-	case 0xA: /* Architectural Performance Monitoring */
-	case 0x80000007: /* Advanced power management */
-	case 0xC0000002:
-	case 0xC0000003:
-	case 0xC0000004:
-	default:
-		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
-		break;
-	}
-
-	kvm_x86_ops->set_supported_cpuid(function, entry);
-
-	put_cpu();
-}
-
-#undef F
-
-static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-				     struct kvm_cpuid_entry2 __user *entries)
-{
-	struct kvm_cpuid_entry2 *cpuid_entries;
-	int limit, nent = 0, r = -E2BIG;
-	u32 func;
-
-	if (cpuid->nent < 1)
-		goto out;
-	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
-		cpuid->nent = KVM_MAX_CPUID_ENTRIES;
-	r = -ENOMEM;
-	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
-	if (!cpuid_entries)
-		goto out;
-
-	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
-	limit = cpuid_entries[0].eax;
-	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
-		do_cpuid_ent(&cpuid_entries[nent], func, 0,
-			     &nent, cpuid->nent);
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
-	limit = cpuid_entries[nent - 1].eax;
-	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
-		do_cpuid_ent(&cpuid_entries[nent], func, 0,
-			     &nent, cpuid->nent);
-
-
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	/* Add support for Centaur's CPUID instruction. */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
-		do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
-				&nent, cpuid->nent);
-
-		r = -E2BIG;
-		if (nent >= cpuid->nent)
-			goto out_free;
-
-		limit = cpuid_entries[nent - 1].eax;
-		for (func = 0xC0000001;
-			func <= limit && nent < cpuid->nent; ++func)
-			do_cpuid_ent(&cpuid_entries[nent], func, 0,
-					&nent, cpuid->nent);
-
-		r = -E2BIG;
-		if (nent >= cpuid->nent)
-			goto out_free;
-	}
-
-	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
-		     cpuid->nent);
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
-		     cpuid->nent);
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	r = -EFAULT;
-	if (copy_to_user(entries, cpuid_entries,
-			 nent * sizeof(struct kvm_cpuid_entry2)))
-		goto out_free;
-	cpuid->nent = nent;
-	r = 0;
-
-out_free:
-	vfree(cpuid_entries);
-out:
-	return r;
-}
-
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
@@ -5438,125 +4929,6 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 	return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
 }
 
-static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
-{
-	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
-	int j, nent = vcpu->arch.cpuid_nent;
-
-	e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
-	/* when no next entry is found, the current entry[i] is reselected */
-	for (j = i + 1; ; j = (j + 1) % nent) {
-		struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
-		if (ej->function == e->function) {
-			ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
-			return j;
-		}
-	}
-	return 0; /* silence gcc, even though control never reaches here */
-}
-
-/* find an entry with matching function, matching index (if needed), and that
- * should be read next (if it's stateful) */
-static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
-	u32 function, u32 index)
-{
-	if (e->function != function)
-		return 0;
-	if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
-		return 0;
-	if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
-	    !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
-		return 0;
-	return 1;
-}
-
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
-					      u32 function, u32 index)
-{
-	int i;
-	struct kvm_cpuid_entry2 *best = NULL;
-
-	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
-		struct kvm_cpuid_entry2 *e;
-
-		e = &vcpu->arch.cpuid_entries[i];
-		if (is_matching_cpuid_entry(e, function, index)) {
-			if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
-				move_to_next_stateful_cpuid_entry(vcpu, i);
-			best = e;
-			break;
-		}
-	}
-	return best;
-}
-EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
-
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
-	if (!best || best->eax < 0x80000008)
-		goto not_found;
-	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
-	if (best)
-		return best->eax & 0xff;
-not_found:
-	return 36;
-}
-
-/*
- * If no match is found, check whether we exceed the vCPU's limit
- * and return the content of the highest valid _standard_ leaf instead.
- * This is to satisfy the CPUID specification.
- */
-static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
-                                                  u32 function, u32 index)
-{
-	struct kvm_cpuid_entry2 *maxlevel;
-
-	maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
-	if (!maxlevel || maxlevel->eax >= function)
-		return NULL;
-	if (function & 0x80000000) {
-		maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
-		if (!maxlevel)
-			return NULL;
-	}
-	return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
-}
-
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
-{
-	u32 function, index;
-	struct kvm_cpuid_entry2 *best;
-
-	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
-	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
-	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
-	best = kvm_find_cpuid_entry(vcpu, function, index);
-
-	if (!best)
-		best = check_cpuid_limit(vcpu, function, index);
-
-	if (best) {
-		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
-		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
-		kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
-		kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
-	}
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
-	trace_kvm_cpuid(function,
-			kvm_register_read(vcpu, VCPU_REGS_RAX),
-			kvm_register_read(vcpu, VCPU_REGS_RBX),
-			kvm_register_read(vcpu, VCPU_REGS_RCX),
-			kvm_register_read(vcpu, VCPU_REGS_RDX));
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
-
 /*
  * Check if userspace requested an interrupt window, and that the
  * interrupt window is open.
@@ -6222,7 +5594,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
 	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
 	if (sregs->cr4 & X86_CR4_OSXSAVE)
-		update_cpuid(vcpu);
+		kvm_update_cpuid(vcpu);
 
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	if (!is_long_mode(vcpu) && is_pae(vcpu)) {
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d36fe237c665..cb80c293cdd8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -33,9 +33,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
 	return (nr == BP_VECTOR) || (nr == OF_VECTOR);
 }
 
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
-                                             u32 function, u32 index);
-
 static inline bool is_protmode(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
@@ -125,4 +122,6 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 	gva_t addr, void *val, unsigned int bytes,
 	struct x86_exception *exception);
 
+extern u64 host_xcr0;
+
 #endif
-- 
cgit v1.2.3-55-g7522


From fb215366b3c7320ac25dca766a0152df16534932 Mon Sep 17 00:00:00 2001
From: Liu, Jinsong
Date: Mon, 28 Nov 2011 03:55:19 -0800
Subject: KVM: expose latest Intel cpu new features (BMI1/BMI2/FMA/AVX2) to
 guest

Intel latest cpu add 6 new features, refer http://software.intel.com/file/36945
The new feature cpuid listed as below:

1. FMA		CPUID.EAX=01H:ECX.FMA[bit 12]
2. MOVBE	CPUID.EAX=01H:ECX.MOVBE[bit 22]
3. BMI1		CPUID.EAX=07H,ECX=0H:EBX.BMI1[bit 3]
4. AVX2		CPUID.EAX=07H,ECX=0H:EBX.AVX2[bit 5]
5. BMI2		CPUID.EAX=07H,ECX=0H:EBX.BMI2[bit 8]
6. LZCNT	CPUID.EAX=80000001H:ECX.LZCNT[bit 5]

This patch expose these features to guest.
Among them, FMA/MOVBE/LZCNT has already been defined, MOVBE/LZCNT has
already been exposed.

This patch defines BMI1/AVX2/BMI2, and exposes FMA/BMI1/AVX2/BMI2 to guest.

Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/cpufeature.h | 3 +++
 arch/x86/kvm/cpuid.c              | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index f3444f700f36..17c5d4bdee5e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -197,7 +197,10 @@
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE	(9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_BMI1	(9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_AVX2	(9*32+ 5) /* AVX2 instructions */
 #define X86_FEATURE_SMEP	(9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2	(9*32+ 8) /* 2nd group bit manipulation extensions */
 #define X86_FEATURE_ERMS	(9*32+ 9) /* Enhanced REP MOVSB/STOSB */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0a332ec5203c..47be763e1b60 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -222,7 +222,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
-		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
+		F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
 		0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
 		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
@@ -242,7 +242,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 7.0.ebx */
 	const u32 kvm_supported_word9_x86_features =
-		F(SMEP) | F(FSGSBASE) | F(ERMS);
+		F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
-- 
cgit v1.2.3-55-g7522


From 831bf664e9c1fc08fc6b3984d00d275cac82f5e9 Mon Sep 17 00:00:00 2001
From: Sasha Levin
Date: Mon, 28 Nov 2011 11:20:29 +0200
Subject: KVM: Refactor and simplify kvm_dev_ioctl_get_supported_cpuid

This patch cleans and simplifies kvm_dev_ioctl_get_supported_cpuid by using a table
instead of duplicating code as Avi suggested.

This patch also fixes a bug where kvm_dev_ioctl_get_supported_cpuid would return
-E2BIG when amount of entries passed was just right.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/cpuid.c | 113 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 63 insertions(+), 50 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 47be763e1b60..52593e8e1ae8 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -183,9 +183,10 @@ static bool supported_xcr0_bit(unsigned bit)
 
 #define F(x) bit(X86_FEATURE_##x)
 
-static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			 u32 index, int *nent, int maxnent)
 {
+	int r;
 	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
 #ifdef CONFIG_X86_64
 	unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
@@ -246,6 +247,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
+
+	r = -E2BIG;
+
+	if (*nent >= maxnent)
+		goto out;
+
 	do_cpuid_1_ent(entry, function, index);
 	++*nent;
 
@@ -271,7 +278,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
 		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
-		for (t = 1; t < times && *nent < maxnent; ++t) {
+		for (t = 1; t < times; ++t) {
+			if (*nent >= maxnent)
+				goto out;
+
 			do_cpuid_1_ent(&entry[t], function, 0);
 			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
 			++*nent;
@@ -284,7 +294,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		/* read more entries until cache_type is zero */
-		for (i = 1; *nent < maxnent; ++i) {
+		for (i = 1; ; ++i) {
+			if (*nent >= maxnent)
+				goto out;
+
 			cache_type = entry[i - 1].eax & 0x1f;
 			if (!cache_type)
 				break;
@@ -316,7 +329,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		/* read more entries until level_type is zero */
-		for (i = 1; *nent < maxnent; ++i) {
+		for (i = 1; ; ++i) {
+			if (*nent >= maxnent)
+				goto out;
+
 			level_type = entry[i - 1].ecx & 0xff00;
 			if (!level_type)
 				break;
@@ -331,7 +347,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		int idx, i;
 
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
+		for (idx = 1, i = 1; idx < 64; ++idx) {
+			if (*nent >= maxnent)
+				goto out;
+
 			do_cpuid_1_ent(&entry[i], function, idx);
 			if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
 				continue;
@@ -416,17 +435,41 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	kvm_x86_ops->set_supported_cpuid(function, entry);
 
+	r = 0;
+
+out:
 	put_cpu();
+
+	return r;
 }
 
 #undef F
 
+struct kvm_cpuid_param {
+	u32 func;
+	u32 idx;
+	bool has_leaf_count;
+	bool (*qualifier)(struct kvm_cpuid_param *param);
+};
+
+static bool is_centaur_cpu(struct kvm_cpuid_param *param)
+{
+	return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
+}
+
 int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 				      struct kvm_cpuid_entry2 __user *entries)
 {
 	struct kvm_cpuid_entry2 *cpuid_entries;
-	int limit, nent = 0, r = -E2BIG;
+	int limit, nent = 0, r = -E2BIG, i;
 	u32 func;
+	static struct kvm_cpuid_param param[] = {
+		{ .func = 0, .has_leaf_count = true },
+		{ .func = 0x80000000, .has_leaf_count = true },
+		{ .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
+		{ .func = KVM_CPUID_SIGNATURE },
+		{ .func = KVM_CPUID_FEATURES },
+	};
 
 	if (cpuid->nent < 1)
 		goto out;
@@ -437,61 +480,31 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 	if (!cpuid_entries)
 		goto out;
 
-	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
-	limit = cpuid_entries[0].eax;
-	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
-		do_cpuid_ent(&cpuid_entries[nent], func, 0,
-			     &nent, cpuid->nent);
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
-	limit = cpuid_entries[nent - 1].eax;
-	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
-		do_cpuid_ent(&cpuid_entries[nent], func, 0,
-			     &nent, cpuid->nent);
-
-
+	r = 0;
+	for (i = 0; i < ARRAY_SIZE(param); i++) {
+		struct kvm_cpuid_param *ent = &param[i];
 
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
+		if (ent->qualifier && !ent->qualifier(ent))
+			continue;
 
-	/* Add support for Centaur's CPUID instruction. */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
-		do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
+		r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
 				&nent, cpuid->nent);
 
-		r = -E2BIG;
-		if (nent >= cpuid->nent)
+		if (r)
 			goto out_free;
 
+		if (!ent->has_leaf_count)
+			continue;
+
 		limit = cpuid_entries[nent - 1].eax;
-		for (func = 0xC0000001;
-			func <= limit && nent < cpuid->nent; ++func)
-			do_cpuid_ent(&cpuid_entries[nent], func, 0,
-					&nent, cpuid->nent);
+		for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
+			r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
+				     &nent, cpuid->nent);
 
-		r = -E2BIG;
-		if (nent >= cpuid->nent)
+		if (r)
 			goto out_free;
 	}
 
-	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
-		     cpuid->nent);
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
-		     cpuid->nent);
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
 	r = -EFAULT;
 	if (copy_to_user(entries, cpuid_entries,
 			 nent * sizeof(struct kvm_cpuid_entry2)))
-- 
cgit v1.2.3-55-g7522


From 0375f7fad904b59502341ccecfc8faea70b34c91 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Mon, 28 Nov 2011 20:41:00 +0800
Subject: KVM: MMU: audit: replace mmu audit tracepoint with jump-label

The tracepoint is only used to audit mmu code, it should not be exposed to
user, let us replace it with jump-label.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 16 +++++++++++-----
 arch/x86/kvm/mmu_audit.c   | 28 +++++++++++++---------------
 arch/x86/kvm/mmutrace.h    | 19 -------------------
 arch/x86/kvm/paging_tmpl.h |  4 ++--
 4 files changed, 26 insertions(+), 41 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d737443cdfdb..62f69dbf6b52 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -68,6 +68,12 @@ char *audit_point_name[] = {
 	"post sync"
 };
 
+#ifdef CONFIG_KVM_MMU_AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
+#endif
+
 #undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
@@ -2852,12 +2858,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 		return;
 
 	vcpu_clear_mmio_info(vcpu, ~0ul);
-	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 		sp = page_header(root);
 		mmu_sync_children(vcpu, sp);
-		trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 		return;
 	}
 	for (i = 0; i < 4; ++i) {
@@ -2869,7 +2875,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 			mmu_sync_children(vcpu, sp);
 		}
 	}
-	trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 }
 
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -3667,7 +3673,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	++vcpu->kvm->stat.mmu_pte_write;
-	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
+	kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
@@ -3700,7 +3706,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	}
 	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
+	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 746ec259d024..5df6736a5afb 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -224,30 +224,29 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
 	mmu_spte_walk(vcpu, audit_spte);
 }
 
-static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
+static bool mmu_audit;
+static struct jump_label_key mmu_audit_key;
+
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
 {
 	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
 
-	if (!__ratelimit(&ratelimit_state))
-		return;
+	if (static_branch((&mmu_audit_key))) {
+		if (!__ratelimit(&ratelimit_state))
+			return;
 
-	vcpu->kvm->arch.audit_point = point;
-	audit_all_active_sps(vcpu->kvm);
-	audit_vcpu_spte(vcpu);
+		vcpu->kvm->arch.audit_point = point;
+		audit_all_active_sps(vcpu->kvm);
+		audit_vcpu_spte(vcpu);
+	}
 }
 
-static bool mmu_audit;
-
 static void mmu_audit_enable(void)
 {
-	int ret;
-
 	if (mmu_audit)
 		return;
 
-	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-	WARN_ON(ret);
-
+	jump_label_inc(&mmu_audit_key);
 	mmu_audit = true;
 }
 
@@ -256,8 +255,7 @@ static void mmu_audit_disable(void)
 	if (!mmu_audit)
 		return;
 
-	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-	tracepoint_synchronize_unregister();
+	jump_label_dec(&mmu_audit_key);
 	mmu_audit = false;
 }
 
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index eed67f34146d..89fb0e81322a 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -243,25 +243,6 @@ TRACE_EVENT(
 	TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
 		  __entry->access)
 );
-
-TRACE_EVENT(
-	kvm_mmu_audit,
-	TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
-	TP_ARGS(vcpu, audit_point),
-
-	TP_STRUCT__entry(
-		__field(struct kvm_vcpu *, vcpu)
-		__field(int, audit_point)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu = vcpu;
-		__entry->audit_point = audit_point;
-	),
-
-	TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
-		  audit_point_name[__entry->audit_point])
-);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 52e9d58cec2b..15610285ebb6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -632,7 +632,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 
-	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
+	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
@@ -643,7 +643,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		 sptep, *sptep, emulate);
 
 	++vcpu->stat.pf_fixed;
-	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
+	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return emulate;
-- 
cgit v1.2.3-55-g7522


From 9edb17d55f3ea4943f9654f2aad7a99b4c55840a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Mon, 28 Nov 2011 20:41:38 +0800
Subject: KVM: x86: remove the dead code of KVM_EXIT_HYPERCALL

KVM_EXIT_HYPERCALL is not used anymore, so remove the code

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4e533d24c513..465053151a2d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5389,10 +5389,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (r <= 0)
 		goto out;
 
-	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
-		kvm_register_write(vcpu, VCPU_REGS_RAX,
-				     kvm_run->hypercall.ret);
-
 	r = __vcpu_run(vcpu);
 
 out:
-- 
cgit v1.2.3-55-g7522


From e459e3228dc57f7160e564ce0f09edb5bee656d3 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Mon, 28 Nov 2011 20:42:16 +0800
Subject: KVM: MMU: move the relevant mmu code to mmu.c

Move the mmu code in kvm_arch_vcpu_init() to kvm_mmu_create()

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 ++++++
 arch/x86/kvm/mmu.c              |  6 +++++-
 arch/x86/kvm/x86.c              | 11 +----------
 3 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1769f3dde611..020413afb285 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -752,6 +752,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
 			      struct x86_exception *exception);
 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
@@ -773,6 +774,11 @@ void kvm_disable_tdp(void);
 int complete_pio(struct kvm_vcpu *vcpu);
 bool kvm_check_iopl(struct kvm_vcpu *vcpu);
 
+static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+	return gpa;
+}
+
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 {
 	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 62f69dbf6b52..262a3af1f0ec 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3839,7 +3839,11 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	vcpu->arch.mmu.translate_gpa = translate_gpa;
+	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 
 	return alloc_mmu_pages(vcpu);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 465053151a2d..d99976e4451e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3430,12 +3430,7 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
 	kvm_x86_ops->get_segment(vcpu, var, seg);
 }
 
-static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
-{
-	return gpa;
-}
-
-static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 {
 	gpa_t t_gpa;
 	struct x86_exception exception;
@@ -5915,10 +5910,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	kvm = vcpu->kvm;
 
 	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
-	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
-	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-	vcpu->arch.mmu.translate_gpa = translate_gpa;
-	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	else
-- 
cgit v1.2.3-55-g7522


From d750ea28865dbff6a73444358d189dd811c68c50 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Mon, 28 Nov 2011 20:43:18 +0800
Subject: KVM: MMU: remove oos_shadow parameter

The unsync code should be stable now, maybe it is the time to remove this
parameter to cleanup the code a little bit

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kernel-parameters.txt | 3 ---
 arch/x86/kvm/mmu.c                  | 5 -----
 2 files changed, 8 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 81c287fad79d..6e7cce096e6b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1178,9 +1178,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
 			Default is 0 (don't ignore, but inject #GP)
 
-	kvm.oos_shadow=	[KVM] Disable out-of-sync shadow paging.
-			Default is 1 (enabled)
-
 	kvm.mmu_audit=	[KVM] This is a R/W parameter which allows audit
 			KVM MMU at runtime.
 			Default is 0 (off)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 262a3af1f0ec..b1178d1bb8f5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -93,9 +93,6 @@ static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
 
-static int oos_shadow = 1;
-module_param(oos_shadow, bool, 0644);
-
 #ifndef MMU_DEBUG
 #define ASSERT(x) do { } while (0)
 #else
@@ -2196,8 +2193,6 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 			return 1;
 
 		if (!need_unsync && !s->unsync) {
-			if (!oos_shadow)
-				return 1;
 			need_unsync = true;
 		}
 	}
-- 
cgit v1.2.3-55-g7522


From e37fa7853c276403da2fea8792c579e8bfd75042 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong
Date: Wed, 30 Nov 2011 17:43:24 +0800
Subject: KVM: MMU: audit: inline audit function

inline audit function and little cleanup

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       | 28 +++++++---------------------
 arch/x86/kvm/mmu_audit.c | 29 +++++++++++++++++++++--------
 2 files changed, 28 insertions(+), 29 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b1178d1bb8f5..7a8e99c6dc81 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -59,21 +59,6 @@ enum {
 	AUDIT_POST_SYNC
 };
 
-char *audit_point_name[] = {
-	"pre page fault",
-	"post page fault",
-	"pre pte write",
-	"post pte write",
-	"pre sync",
-	"post sync"
-};
-
-#ifdef CONFIG_KVM_MMU_AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
-#endif
-
 #undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
@@ -1539,6 +1524,13 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
 	return ret;
 }
 
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
+static void mmu_audit_disable(void) { }
+#endif
+
 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			 struct list_head *invalid_list)
 {
@@ -4035,12 +4027,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 	mmu_free_memory_caches(vcpu);
 }
 
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void mmu_audit_disable(void) { }
-#endif
-
 void kvm_mmu_module_exit(void)
 {
 	mmu_destroy_caches();
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 5df6736a5afb..fe15dcc07a6b 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,6 +19,15 @@
 
 #include <linux/ratelimit.h>
 
+char const *audit_point_name[] = {
+	"pre page fault",
+	"post page fault",
+	"pre pte write",
+	"post pte write",
+	"pre sync",
+	"post sync"
+};
+
 #define audit_printk(kvm, fmt, args...)		\
 	printk(KERN_ERR "audit: (%s) error: "	\
 		fmt, audit_point_name[kvm->arch.audit_point], ##args)
@@ -227,18 +236,22 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
 static bool mmu_audit;
 static struct jump_label_key mmu_audit_key;
 
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
+static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
 {
 	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
 
-	if (static_branch((&mmu_audit_key))) {
-		if (!__ratelimit(&ratelimit_state))
-			return;
+	if (!__ratelimit(&ratelimit_state))
+		return;
 
-		vcpu->kvm->arch.audit_point = point;
-		audit_all_active_sps(vcpu->kvm);
-		audit_vcpu_spte(vcpu);
-	}
+	vcpu->kvm->arch.audit_point = point;
+	audit_all_active_sps(vcpu->kvm);
+	audit_vcpu_spte(vcpu);
+}
+
+static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
+{
+	if (static_branch((&mmu_audit_key)))
+		__kvm_mmu_audit(vcpu, point);
 }
 
 static void mmu_audit_enable(void)
-- 
cgit v1.2.3-55-g7522


From 086c9855019935854311b477b33498a6ea357ef6 Mon Sep 17 00:00:00 2001
From: Alex,Shi
Date: Thu, 20 Oct 2011 15:34:01 +0800
Subject: KVM: use this_cpu_xxx replace percpu_xxx funcs

percpu_xxx funcs are duplicated with this_cpu_xxx funcs, so replace them
for further code clean up.

And in preempt safe scenario, __this_cpu_xxx funcs has a bit better
performance since __this_cpu_xxx has no redundant preempt_disable()

Signed-off-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d99976e4451e..d55a94f1155a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4664,15 +4664,15 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
 
 static int kvm_is_in_guest(void)
 {
-	return percpu_read(current_vcpu) != NULL;
+	return __this_cpu_read(current_vcpu) != NULL;
 }
 
 static int kvm_is_user_mode(void)
 {
 	int user_mode = 3;
 
-	if (percpu_read(current_vcpu))
-		user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+	if (__this_cpu_read(current_vcpu))
+		user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
 
 	return user_mode != 0;
 }
@@ -4681,8 +4681,8 @@ static unsigned long kvm_get_guest_ip(void)
 {
 	unsigned long ip = 0;
 
-	if (percpu_read(current_vcpu))
-		ip = kvm_rip_read(percpu_read(current_vcpu));
+	if (__this_cpu_read(current_vcpu))
+		ip = kvm_rip_read(__this_cpu_read(current_vcpu));
 
 	return ip;
 }
@@ -4695,13 +4695,13 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
 
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
 {
-	percpu_write(current_vcpu, vcpu);
+	__this_cpu_write(current_vcpu, vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
 
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
 {
-	percpu_write(current_vcpu, NULL);
+	__this_cpu_write(current_vcpu, NULL);
 }
 EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
 
-- 
cgit v1.2.3-55-g7522


From 3d56cbdf359c953f8bfcab68aa5cf766e4480799 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Fri, 2 Dec 2011 18:35:24 +0100
Subject: KVM: MMU: Drop unused return value of
 kvm_mmu_remove_some_alloc_mmu_pages

freed_pages is never evaluated, so remove it as well as the return code
kvm_mmu_remove_some_alloc_mmu_pages so far delivered to its only user.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7a8e99c6dc81..2a2a9b40db19 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3890,14 +3890,14 @@ restart:
 	spin_unlock(&kvm->mmu_lock);
 }
 
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
-					       struct list_head *invalid_list)
+static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
+						struct list_head *invalid_list)
 {
 	struct kvm_mmu_page *page;
 
 	page = container_of(kvm->arch.active_mmu_pages.prev,
 			    struct kvm_mmu_page, link);
-	return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
+	kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
 }
 
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
@@ -3912,15 +3912,15 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	raw_spin_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		int idx, freed_pages;
+		int idx;
 		LIST_HEAD(invalid_list);
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
 		if (!kvm_freed && nr_to_scan > 0 &&
 		    kvm->arch.n_used_mmu_pages > 0) {
-			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
-							  &invalid_list);
+			kvm_mmu_remove_some_alloc_mmu_pages(kvm,
+							    &invalid_list);
 			kvm_freed = kvm;
 		}
 		nr_to_scan--;
-- 
cgit v1.2.3-55-g7522


From 234b639206a7d9d5ca362cff64ceddd4f27e4a46 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Fri, 2 Dec 2011 18:26:28 +0100
Subject: KVM: x86 emulator: Remove set-but-unused cr4 from check_cr_write

This was probably copy&pasted from the cr0 case, but it's unneeded here.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ac8e5ed78834..f641201c7b31 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3028,9 +3028,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
 		break;
 		}
 	case 4: {
-		u64 cr4;
-
-		cr4 = ctxt->ops->get_cr(ctxt, 4);
 		ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
 
 		if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
-- 
cgit v1.2.3-55-g7522


From cdfca7b346e6dbab1ba33260c28ccb8333485a5b Mon Sep 17 00:00:00 2001
From: Sasha Levin
Date: Sun, 4 Dec 2011 19:36:28 +0200
Subject: KVM: Use kmemdup() instead of kmalloc/memcpy

Switch to kmemdup() in two places to shorten the code and avoid possible bugs.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c  | 4 ++--
 virt/kvm/kvm_main.c | 7 +++----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d55a94f1155a..03042d60a8fc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3031,10 +3031,10 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		memset(dirty_bitmap_head, 0, n);
 
 		r = -ENOMEM;
-		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+		slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL);
 		if (!slots)
 			goto out;
-		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+
 		memslot = id_to_memslot(slots, log->slot);
 		memslot->nr_dirty_pages = 0;
 		memslot->dirty_bitmap = dirty_bitmap_head;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e289486edc6d..a6e612fced73 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2591,13 +2591,12 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 	int i, r;
 	struct kvm_io_bus *new_bus, *bus;
 
-	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
+	bus = kvm->buses[bus_idx];
+
+	new_bus = kmemdup(bus, sizeof(*bus), GFP_KERNEL);
 	if (!new_bus)
 		return -ENOMEM;
 
-	bus = kvm->buses[bus_idx];
-	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
-
 	r = -ENOENT;
 	for (i = 0; i < new_bus->dev_count; i++)
 		if (new_bus->range[i].dev == dev) {
-- 
cgit v1.2.3-55-g7522


From ff5c2c0316ff0e3e2dba3ca14167d994453df093 Mon Sep 17 00:00:00 2001
From: Sasha Levin
Date: Sun, 4 Dec 2011 19:36:29 +0200
Subject: KVM: Use memdup_user instead of kmalloc/copy_from_user

Switch to using memdup_user when possible. This makes code more
smaller and compact, and prevents errors.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c  | 82 +++++++++++++++++++++++------------------------------
 virt/kvm/kvm_main.c | 29 ++++++++-----------
 2 files changed, 47 insertions(+), 64 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03042d60a8fc..0a646e2b57c5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1309,12 +1309,11 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
 	if (page_num >= blob_size)
 		goto out;
 	r = -ENOMEM;
-	page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!page)
+	page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
+	if (IS_ERR(page)) {
+		r = PTR_ERR(page);
 		goto out;
-	r = -EFAULT;
-	if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
-		goto out_free;
+	}
 	if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
 		goto out_free;
 	r = 0;
@@ -1988,15 +1987,12 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
 	if (msrs.nmsrs >= MAX_IO_MSRS)
 		goto out;
 
-	r = -ENOMEM;
 	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
-	entries = kmalloc(size, GFP_KERNEL);
-	if (!entries)
+	entries = memdup_user(user_msrs->entries, size);
+	if (IS_ERR(entries)) {
+		r = PTR_ERR(entries);
 		goto out;
-
-	r = -EFAULT;
-	if (copy_from_user(entries, user_msrs->entries, size))
-		goto out_free;
+	}
 
 	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
 	if (r < 0)
@@ -2533,13 +2529,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (!vcpu->arch.apic)
 			goto out;
-		u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!u.lapic)
-			goto out;
-		r = -EFAULT;
-		if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
+		u.lapic = memdup_user(argp, sizeof(*u.lapic));
+		if (IS_ERR(u.lapic)) {
+			r = PTR_ERR(u.lapic);
 			goto out;
+		}
+
 		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
 		if (r)
 			goto out;
@@ -2718,14 +2713,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_XSAVE: {
-		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!u.xsave)
-			break;
-
-		r = -EFAULT;
-		if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
-			break;
+		u.xsave = memdup_user(argp, sizeof(*u.xsave));
+		if (IS_ERR(u.xsave)) {
+			r = PTR_ERR(u.xsave);
+			goto out;
+		}
 
 		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
 		break;
@@ -2746,15 +2738,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_XCRS: {
-		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!u.xcrs)
-			break;
-
-		r = -EFAULT;
-		if (copy_from_user(u.xcrs, argp,
-				   sizeof(struct kvm_xcrs)))
-			break;
+		u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
+		if (IS_ERR(u.xcrs)) {
+			r = PTR_ERR(u.xcrs);
+			goto out;
+		}
 
 		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
 		break;
@@ -3190,14 +3178,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 	case KVM_GET_IRQCHIP: {
 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+		struct kvm_irqchip *chip;
 
-		r = -ENOMEM;
-		if (!chip)
+		chip = memdup_user(argp, sizeof(*chip));
+		if (IS_ERR(chip)) {
+			r = PTR_ERR(chip);
 			goto out;
-		r = -EFAULT;
-		if (copy_from_user(chip, argp, sizeof *chip))
-			goto get_irqchip_out;
+		}
+
 		r = -ENXIO;
 		if (!irqchip_in_kernel(kvm))
 			goto get_irqchip_out;
@@ -3216,14 +3204,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 	case KVM_SET_IRQCHIP: {
 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+		struct kvm_irqchip *chip;
 
-		r = -ENOMEM;
-		if (!chip)
+		chip = memdup_user(argp, sizeof(*chip));
+		if (IS_ERR(chip)) {
+			r = PTR_ERR(chip);
 			goto out;
-		r = -EFAULT;
-		if (copy_from_user(chip, argp, sizeof *chip))
-			goto set_irqchip_out;
+		}
+
 		r = -ENXIO;
 		if (!irqchip_in_kernel(kvm))
 			goto set_irqchip_out;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a6e612fced73..d8bac0751666 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1821,12 +1821,11 @@ out_free1:
 		struct kvm_regs *kvm_regs;
 
 		r = -ENOMEM;
-		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
-		if (!kvm_regs)
+		kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
+		if (IS_ERR(kvm_regs)) {
+			r = PTR_ERR(kvm_regs);
 			goto out;
-		r = -EFAULT;
-		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
-			goto out_free2;
+		}
 		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
 		if (r)
 			goto out_free2;
@@ -1850,13 +1849,11 @@ out_free2:
 		break;
 	}
 	case KVM_SET_SREGS: {
-		kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!kvm_sregs)
-			goto out;
-		r = -EFAULT;
-		if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
+		kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
+		if (IS_ERR(kvm_sregs)) {
+			r = PTR_ERR(kvm_sregs);
 			goto out;
+		}
 		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
 		if (r)
 			goto out;
@@ -1952,13 +1949,11 @@ out_free2:
 		break;
 	}
 	case KVM_SET_FPU: {
-		fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!fpu)
-			goto out;
-		r = -EFAULT;
-		if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
+		fpu = memdup_user(argp, sizeof(*fpu));
+		if (IS_ERR(fpu)) {
+			r = PTR_ERR(fpu);
 			goto out;
+		}
 		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
 		if (r)
 			goto out;
-- 
cgit v1.2.3-55-g7522


From 43771ebfc9d34ab1f74095d052d225a82ae0898c Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 14 Dec 2011 12:27:47 +0200
Subject: KVM: Make KVM_INTEL depend on CPU_SUP_INTEL

PMU virtualization needs to talk to Intel-specific bits of perf; these are
only available when CPU_SUP_INTEL=y.

Fixes

  arch/x86/built-in.o: In function `atomic_switch_perf_msrs':
  vmx.c:(.text+0x6b1d4): undefined reference to `perf_guest_get_msrs'

Reported-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ff5790d8e990..ca4d49ed9a6c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -52,6 +52,8 @@ config KVM
 config KVM_INTEL
 	tristate "KVM for Intel processors support"
 	depends on KVM
+	# for perf_guest_get_msrs():
+	depends on CPU_SUP_INTEL
 	---help---
 	  Provides support for KVM on Intel processors equipped with the VT
 	  extensions.
-- 
cgit v1.2.3-55-g7522


From bb5a798ad58996e4d666ead1016705854d5ca616 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Wed, 14 Dec 2011 17:58:18 +0100
Subject: KVM: x86: Do not rely on implicit inclusions

Works so far by change, but it is not guaranteed to stay like this.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/cpuid.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 52593e8e1ae8..ca63032bf03d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -14,6 +14,8 @@
 
 #include <linux/kvm_host.h>
 #include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
 #include <asm/user.h>
 #include <asm/xsave.h>
 #include "cpuid.h"
-- 
cgit v1.2.3-55-g7522


From a647795efbedeedf8a1dc6deded26defa23562bd Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Wed, 14 Dec 2011 19:25:33 +0100
Subject: KVM: x86: Consolidate PIT legacy test

Move the test for KVM_PIT_FLAGS_HPET_LEGACY into create_pit_timer
instead of replicating it on the caller site.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/i8254.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 405f2620392f..d68f99df690c 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -344,7 +344,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 	struct kvm_timer *pt = &ps->pit_timer;
 	s64 interval;
 
-	if (!irqchip_in_kernel(kvm))
+	if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
 		return;
 
 	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -397,15 +397,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	case 1:
         /* FIXME: enhance mode 4 precision */
 	case 4:
-		if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
-			create_pit_timer(kvm, val, 0);
-		}
+		create_pit_timer(kvm, val, 0);
 		break;
 	case 2:
 	case 3:
-		if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
-			create_pit_timer(kvm, val, 1);
-		}
+		create_pit_timer(kvm, val, 1);
 		break;
 	default:
 		destroy_pit_timer(kvm->arch.vpit);
-- 
cgit v1.2.3-55-g7522


From d546cb406ea0d83e2d39ec14221957a24f88a622 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 15 Dec 2011 12:38:40 +0200
Subject: KVM: drop bsp_vcpu pointer from kvm struct

Drop bsp_vcpu pointer from kvm struct since its only use is incorrect
anyway.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/i8259.c     | 24 ++++++++++++++++--------
 include/linux/kvm_host.h |  1 -
 virt/kvm/ioapic.c        |  2 +-
 virt/kvm/kvm_main.c      |  4 ----
 4 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index cac4746d7ffb..b6a73537e1ef 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -262,9 +262,10 @@ int kvm_pic_read_irq(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-	int irq;
-	struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
+	int irq, i;
+	struct kvm_vcpu *vcpu;
 	u8 irr = s->irr, isr = s->imr;
+	bool found = false;
 
 	s->last_irr = 0;
 	s->irr = 0;
@@ -281,12 +282,19 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
 	s->special_fully_nested_mode = 0;
 	s->init4 = 0;
 
-	for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
-		if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
-			if (irr & (1 << irq) || isr & (1 << irq)) {
-				pic_clear_isr(s, irq);
-			}
-	}
+	kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
+		if (kvm_apic_accept_pic_intr(vcpu)) {
+			found = true;
+			break;
+		}
+
+
+	if (!found)
+		return;
+
+	for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+		if (irr & (1 << irq) || isr & (1 << irq))
+			pic_clear_isr(s, irq);
 }
 
 static void pic_ioport_write(void *opaque, u32 addr, u32 val)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8c5c30361b0d..7a080383935f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -251,7 +251,6 @@ struct kvm {
 	struct srcu_struct srcu;
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
 	u32 bsp_vcpu_id;
-	struct kvm_vcpu *bsp_vcpu;
 #endif
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 	atomic_t online_vcpus;
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 71e2253edee7..dcaf272c26c0 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -185,7 +185,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 		irqe.dest_mode = 0; /* Physical mode. */
 		/* need to read apic_id from apic regiest since
 		 * it can be rewritten */
-		irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id;
+		irqe.dest_id = ioapic->kvm->bsp_vcpu_id;
 	}
 #endif
 	return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d8bac0751666..0835c4b90d2f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1743,10 +1743,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 	smp_wmb();
 	atomic_inc(&kvm->online_vcpus);
 
-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
-	if (kvm->bsp_vcpu_id == id)
-		kvm->bsp_vcpu = vcpu;
-#endif
 	mutex_unlock(&kvm->lock);
 	return r;
 
-- 
cgit v1.2.3-55-g7522


From c15af35f54631b9e9b7ad1981016cc6e73cec794 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Tue, 6 Dec 2011 18:06:02 +0900
Subject: KVM: x86 emulator: Use opcode::execute for Group 1A instruction

Group 1A: 8F

Register em_pop() directly and remove em_grp1a().

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f641201c7b31..cd49774f2d0e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1675,11 +1675,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static int em_grp1a(struct x86_emulate_ctxt *ctxt)
-{
-	return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes);
-}
-
 static int em_grp2(struct x86_emulate_ctxt *ctxt)
 {
 	switch (ctxt->modrm_reg) {
@@ -3203,7 +3198,7 @@ static struct opcode group1[] = {
 };
 
 static struct opcode group1A[] = {
-	D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
+	I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N,
 };
 
 static struct opcode group3[] = {
@@ -4031,9 +4026,6 @@ special_insn:
 	case 0x8d: /* lea r16/r32, m */
 		ctxt->dst.val = ctxt->src.addr.mem.ea;
 		break;
-	case 0x8f:		/* pop (sole member of Grp1a) */
-		rc = em_grp1a(ctxt);
-		break;
 	case 0x90 ... 0x97: /* nop / xchg reg, rax */
 		if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
 			break;
-- 
cgit v1.2.3-55-g7522


From c04ec8393f3815e0f60dde1d6b29040bf1875d52 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Tue, 6 Dec 2011 18:06:44 +0900
Subject: KVM: x86 emulator: Use opcode::execute for Group 4/5 instructions

Group 4: FE
Group 5: FF

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index cd49774f2d0e..5b78785de41b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3213,16 +3213,19 @@ static struct opcode group3[] = {
 };
 
 static struct opcode group4[] = {
-	D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
+	I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
+	I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
 	N, N, N, N, N, N,
 };
 
 static struct opcode group5[] = {
-	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
-	D(SrcMem | ModRM | Stack),
+	I(DstMem | SrcNone | ModRM | Lock, em_grp45),
+	I(DstMem | SrcNone | ModRM | Lock, em_grp45),
+	I(SrcMem | ModRM | Stack, em_grp45),
 	I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
-	D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
-	D(SrcMem | ModRM | Stack), N,
+	I(SrcMem | ModRM | Stack, em_grp45),
+	I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45),
+	I(SrcMem | ModRM | Stack, em_grp45), N,
 };
 
 static struct opcode group6[] = {
@@ -4082,12 +4085,6 @@ special_insn:
 	case 0xfd: /* std */
 		ctxt->eflags |= EFLG_DF;
 		break;
-	case 0xfe: /* Grp4 */
-		rc = em_grp45(ctxt);
-		break;
-	case 0xff: /* Grp5 */
-		rc = em_grp45(ctxt);
-		break;
 	default:
 		goto cannot_emulate;
 	}
-- 
cgit v1.2.3-55-g7522


From e0dac408d08c2a5e1bed2a6a9da7f3af3f7a9827 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa
Date: Tue, 6 Dec 2011 18:07:27 +0900
Subject: KVM: x86 emulator: Use opcode::execute for Group 9 instruction

Group 9: 0F C7

Rename em_grp9() to em_cmpxchg8b() and register it.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5b78785de41b..de7be77820d5 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1784,7 +1784,7 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
 	return rc;
 }
 
-static int em_grp9(struct x86_emulate_ctxt *ctxt)
+static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
 {
 	u64 old = ctxt->dst.orig_val64;
 
@@ -3261,7 +3261,7 @@ static struct opcode group8[] = {
 };
 
 static struct group_dual group9 = { {
-	N, D(DstMem64 | ModRM | Lock | PageTable), N, N, N, N, N, N,
+	N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
 }, {
 	N, N, N, N, N, N, N, N,
 } };
@@ -4202,9 +4202,6 @@ twobyte_insn:
 		ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
 							(u64) ctxt->src.val;
 		break;
-	case 0xc7:		/* Grp9 (cmpxchg8b) */
-		rc = em_grp9(ctxt);
-		break;
 	default:
 		goto cannot_emulate;
 	}
-- 
cgit v1.2.3-55-g7522


From 893420822192f717af6fde927c9e78c9b82f8327 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 10 Nov 2011 14:57:21 +0200
Subject: KVM: Expose kvm_lapic_local_deliver()

Needed to deliver performance monitoring interrupts.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/lapic.c | 2 +-
 arch/x86/kvm/lapic.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a7f3e655cd3e..cfdc6e0ef002 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1121,7 +1121,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
 {
 	u32 reg = apic_get_reg(apic, lvt_type);
 	int vector, mode, trig_mode;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 138e8cc6fea6..6f4ce2575d09 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -34,6 +34,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-- 
cgit v1.2.3-55-g7522


From f5132b01386b5a67f1ff673bb2b96a507a3f7e41 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 10 Nov 2011 14:57:22 +0200
Subject: KVM: Expose a version 2 architectural PMU to a guests

Use perf_events to emulate an architectural PMU, version 2.

Based on PMU version 1 emulation by Avi Kivity.

[avi: adjust for cpuid.c]
[jan: fix anonymous field initialization for older gcc]

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  48 ++++
 arch/x86/kvm/Kconfig            |   1 +
 arch/x86/kvm/Makefile           |   2 +-
 arch/x86/kvm/cpuid.c            |   2 +
 arch/x86/kvm/pmu.c              | 533 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              |  22 +-
 include/linux/kvm_host.h        |   2 +
 7 files changed, 600 insertions(+), 10 deletions(-)
 create mode 100644 arch/x86/kvm/pmu.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 020413afb285..fb60ffdb4e43 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -16,10 +16,12 @@
 #include <linux/mmu_notifier.h>
 #include <linux/tracepoint.h>
 #include <linux/cpumask.h>
+#include <linux/irq_work.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
 #include <linux/kvm_types.h>
+#include <linux/perf_event.h>
 
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
@@ -291,6 +293,37 @@ struct kvm_mmu {
 	u64 pdptrs[4]; /* pae */
 };
 
+enum pmc_type {
+	KVM_PMC_GP = 0,
+	KVM_PMC_FIXED,
+};
+
+struct kvm_pmc {
+	enum pmc_type type;
+	u8 idx;
+	u64 counter;
+	u64 eventsel;
+	struct perf_event *perf_event;
+	struct kvm_vcpu *vcpu;
+};
+
+struct kvm_pmu {
+	unsigned nr_arch_gp_counters;
+	unsigned nr_arch_fixed_counters;
+	unsigned available_event_types;
+	u64 fixed_ctr_ctrl;
+	u64 global_ctrl;
+	u64 global_status;
+	u64 global_ovf_ctrl;
+	u64 counter_bitmask[2];
+	u64 global_ctrl_mask;
+	u8 version;
+	struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC];
+	struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED];
+	struct irq_work irq_work;
+	u64 reprogram_pmi;
+};
+
 struct kvm_vcpu_arch {
 	/*
 	 * rip and regs accesses must go through
@@ -424,6 +457,8 @@ struct kvm_vcpu_arch {
 	unsigned access;
 	gfn_t mmio_gfn;
 
+	struct kvm_pmu pmu;
+
 	/* used for guest single stepping over the given code position */
 	unsigned long singlestep_rip;
 
@@ -891,4 +926,17 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
 
+int kvm_is_in_guest(void);
+
+void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+void kvm_pmu_reset(struct kvm_vcpu *vcpu);
+void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
+bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
+void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
+void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ca4d49ed9a6c..1a7fe868f375 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -35,6 +35,7 @@ config KVM
 	select KVM_MMIO
 	select TASKSTATS
 	select TASK_DELAY_ACCT
+	select PERF_EVENTS
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 161b76ae87c4..4f579e8dcacf 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
-			   i8254.o timer.o cpuid.o
+			   i8254.o timer.o cpuid.o pmu.o
 kvm-intel-y		+= vmx.o
 kvm-amd-y		+= svm.o
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ca63032bf03d..e70be46f50fc 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -45,6 +45,8 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu)
 		else
 			apic->lapic_timer.timer_mode_mask = 1 << 17;
 	}
+
+	kvm_pmu_cpuid_update(vcpu);
 }
 
 static int is_efer_nx(void)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
new file mode 100644
index 000000000000..7aad5446f393
--- /dev/null
+++ b/arch/x86/kvm/pmu.c
@@ -0,0 +1,533 @@
+/*
+ * Kernel-based Virtual Machine -- Performane Monitoring Unit support
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ *   Avi Kivity   <avi@redhat.com>
+ *   Gleb Natapov <gleb@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+
+static struct kvm_arch_event_perf_mapping {
+	u8 eventsel;
+	u8 unit_mask;
+	unsigned event_type;
+	bool inexact;
+} arch_events[] = {
+	/* Index must match CPUID 0x0A.EBX bit vector */
+	[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+	[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+	[2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES  },
+	[3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
+	[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
+	[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+	[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+};
+
+/* mapping between fixed pmc index and arch_events array */
+int fixed_pmc_events[] = {1, 0, 2};
+
+static bool pmc_is_gp(struct kvm_pmc *pmc)
+{
+	return pmc->type == KVM_PMC_GP;
+}
+
+static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
+{
+	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
+
+	return pmu->counter_bitmask[pmc->type];
+}
+
+static inline bool pmc_enabled(struct kvm_pmc *pmc)
+{
+	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
+	return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+}
+
+static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+					 u32 base)
+{
+	if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
+		return &pmu->gp_counters[msr - base];
+	return NULL;
+}
+
+static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+	int base = MSR_CORE_PERF_FIXED_CTR0;
+	if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
+		return &pmu->fixed_counters[msr - base];
+	return NULL;
+}
+
+static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
+{
+	return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx);
+}
+
+static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
+{
+	if (idx < X86_PMC_IDX_FIXED)
+		return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
+	else
+		return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED);
+}
+
+void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.apic)
+		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+}
+
+static void trigger_pmi(struct irq_work *irq_work)
+{
+	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu,
+			irq_work);
+	struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu,
+			arch.pmu);
+
+	kvm_deliver_pmi(vcpu);
+}
+
+static void kvm_perf_overflow(struct perf_event *perf_event,
+			      struct perf_sample_data *data,
+			      struct pt_regs *regs)
+{
+	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
+	__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+}
+
+static void kvm_perf_overflow_intr(struct perf_event *perf_event,
+		struct perf_sample_data *data, struct pt_regs *regs)
+{
+	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
+	if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
+		kvm_perf_overflow(perf_event, data, regs);
+		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+		/*
+		 * Inject PMI. If vcpu was in a guest mode during NMI PMI
+		 * can be ejected on a guest mode re-entry. Otherwise we can't
+		 * be sure that vcpu wasn't executing hlt instruction at the
+		 * time of vmexit and is not going to re-enter guest mode until,
+		 * woken up. So we should wake it, but this is impossible from
+		 * NMI context. Do it from irq work instead.
+		 */
+		if (!kvm_is_in_guest())
+			irq_work_queue(&pmc->vcpu->arch.pmu.irq_work);
+		else
+			kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
+	}
+}
+
+static u64 read_pmc(struct kvm_pmc *pmc)
+{
+	u64 counter, enabled, running;
+
+	counter = pmc->counter;
+
+	if (pmc->perf_event)
+		counter += perf_event_read_value(pmc->perf_event,
+						 &enabled, &running);
+
+	/* FIXME: Scaling needed? */
+
+	return counter & pmc_bitmask(pmc);
+}
+
+static void stop_counter(struct kvm_pmc *pmc)
+{
+	if (pmc->perf_event) {
+		pmc->counter = read_pmc(pmc);
+		perf_event_release_kernel(pmc->perf_event);
+		pmc->perf_event = NULL;
+	}
+}
+
+static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
+		unsigned config, bool exclude_user, bool exclude_kernel,
+		bool intr)
+{
+	struct perf_event *event;
+	struct perf_event_attr attr = {
+		.type = type,
+		.size = sizeof(attr),
+		.pinned = true,
+		.exclude_idle = true,
+		.exclude_host = 1,
+		.exclude_user = exclude_user,
+		.exclude_kernel = exclude_kernel,
+		.config = config,
+	};
+
+	attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
+
+	event = perf_event_create_kernel_counter(&attr, -1, current,
+						 intr ? kvm_perf_overflow_intr :
+						 kvm_perf_overflow, pmc);
+	if (IS_ERR(event)) {
+		printk_once("kvm: pmu event creation failed %ld\n",
+				PTR_ERR(event));
+		return;
+	}
+
+	pmc->perf_event = event;
+	clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi);
+}
+
+static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select,
+		u8 unit_mask)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(arch_events); i++)
+		if (arch_events[i].eventsel == event_select
+				&& arch_events[i].unit_mask == unit_mask
+				&& (pmu->available_event_types & (1 << i)))
+			break;
+
+	if (i == ARRAY_SIZE(arch_events))
+		return PERF_COUNT_HW_MAX;
+
+	return arch_events[i].event_type;
+}
+
+static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
+{
+	unsigned config, type = PERF_TYPE_RAW;
+	u8 event_select, unit_mask;
+
+	pmc->eventsel = eventsel;
+
+	stop_counter(pmc);
+
+	if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc))
+		return;
+
+	event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+	unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+
+	if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE |
+				ARCH_PERFMON_EVENTSEL_INV |
+				ARCH_PERFMON_EVENTSEL_CMASK))) {
+		config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
+				unit_mask);
+		if (config != PERF_COUNT_HW_MAX)
+			type = PERF_TYPE_HARDWARE;
+	}
+
+	if (type == PERF_TYPE_RAW)
+		config = eventsel & X86_RAW_EVENT_MASK;
+
+	reprogram_counter(pmc, type, config,
+			!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+			!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+			eventsel & ARCH_PERFMON_EVENTSEL_INT);
+}
+
+static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
+{
+	unsigned en = en_pmi & 0x3;
+	bool pmi = en_pmi & 0x8;
+
+	stop_counter(pmc);
+
+	if (!en || !pmc_enabled(pmc))
+		return;
+
+	reprogram_counter(pmc, PERF_TYPE_HARDWARE,
+			arch_events[fixed_pmc_events[idx]].event_type,
+			!(en & 0x2), /* exclude user */
+			!(en & 0x1), /* exclude kernel */
+			pmi);
+}
+
+static inline u8 fixed_en_pmi(u64 ctrl, int idx)
+{
+	return (ctrl >> (idx * 4)) & 0xf;
+}
+
+static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
+{
+	int i;
+
+	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+		u8 en_pmi = fixed_en_pmi(data, i);
+		struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i);
+
+		if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi)
+			continue;
+
+		reprogram_fixed_counter(pmc, en_pmi, i);
+	}
+
+	pmu->fixed_ctr_ctrl = data;
+}
+
+static void reprogram_idx(struct kvm_pmu *pmu, int idx)
+{
+	struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx);
+
+	if (!pmc)
+		return;
+
+	if (pmc_is_gp(pmc))
+		reprogram_gp_counter(pmc, pmc->eventsel);
+	else {
+		int fidx = idx - X86_PMC_IDX_FIXED;
+		reprogram_fixed_counter(pmc,
+				fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
+	}
+}
+
+static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
+{
+	int bit;
+	u64 diff = pmu->global_ctrl ^ data;
+
+	pmu->global_ctrl = data;
+
+	for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+		reprogram_idx(pmu, bit);
+}
+
+bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	int ret;
+
+	switch (msr) {
+	case MSR_CORE_PERF_FIXED_CTR_CTRL:
+	case MSR_CORE_PERF_GLOBAL_STATUS:
+	case MSR_CORE_PERF_GLOBAL_CTRL:
+	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+		ret = pmu->version > 1;
+		break;
+	default:
+		ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)
+			|| get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0)
+			|| get_fixed_pmc(pmu, msr);
+		break;
+	}
+	return ret;
+}
+
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_pmc *pmc;
+
+	switch (index) {
+	case MSR_CORE_PERF_FIXED_CTR_CTRL:
+		*data = pmu->fixed_ctr_ctrl;
+		return 0;
+	case MSR_CORE_PERF_GLOBAL_STATUS:
+		*data = pmu->global_status;
+		return 0;
+	case MSR_CORE_PERF_GLOBAL_CTRL:
+		*data = pmu->global_ctrl;
+		return 0;
+	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+		*data = pmu->global_ovf_ctrl;
+		return 0;
+	default:
+		if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
+				(pmc = get_fixed_pmc(pmu, index))) {
+			*data = read_pmc(pmc);
+			return 0;
+		} else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
+			*data = pmc->eventsel;
+			return 0;
+		}
+	}
+	return 1;
+}
+
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_pmc *pmc;
+
+	switch (index) {
+	case MSR_CORE_PERF_FIXED_CTR_CTRL:
+		if (pmu->fixed_ctr_ctrl == data)
+			return 0;
+		if (!(data & 0xfffffffffffff444)) {
+			reprogram_fixed_counters(pmu, data);
+			return 0;
+		}
+		break;
+	case MSR_CORE_PERF_GLOBAL_STATUS:
+		break; /* RO MSR */
+	case MSR_CORE_PERF_GLOBAL_CTRL:
+		if (pmu->global_ctrl == data)
+			return 0;
+		if (!(data & pmu->global_ctrl_mask)) {
+			global_ctrl_changed(pmu, data);
+			return 0;
+		}
+		break;
+	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+		if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
+			pmu->global_status &= ~data;
+			pmu->global_ovf_ctrl = data;
+			return 0;
+		}
+		break;
+	default:
+		if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
+				(pmc = get_fixed_pmc(pmu, index))) {
+			data = (s64)(s32)data;
+			pmc->counter += data - read_pmc(pmc);
+			return 0;
+		} else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
+			if (data == pmc->eventsel)
+				return 0;
+			if (!(data & 0xffffffff00200000ull)) {
+				reprogram_gp_counter(pmc, data);
+				return 0;
+			}
+		}
+	}
+	return 1;
+}
+
+int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	bool fast_mode = pmc & (1u << 31);
+	bool fixed = pmc & (1u << 30);
+	struct kvm_pmc *counters;
+	u64 ctr;
+
+	pmc &= (3u << 30) - 1;
+	if (!fixed && pmc >= pmu->nr_arch_gp_counters)
+		return 1;
+	if (fixed && pmc >= pmu->nr_arch_fixed_counters)
+		return 1;
+	counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
+	ctr = read_pmc(&counters[pmc]);
+	if (fast_mode)
+		ctr = (u32)ctr;
+	*data = ctr;
+
+	return 0;
+}
+
+void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_cpuid_entry2 *entry;
+	unsigned bitmap_len;
+
+	pmu->nr_arch_gp_counters = 0;
+	pmu->nr_arch_fixed_counters = 0;
+	pmu->counter_bitmask[KVM_PMC_GP] = 0;
+	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+	pmu->version = 0;
+
+	entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+	if (!entry)
+		return;
+
+	pmu->version = entry->eax & 0xff;
+	if (!pmu->version)
+		return;
+
+	pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
+			X86_PMC_MAX_GENERIC);
+	pmu->counter_bitmask[KVM_PMC_GP] =
+		((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
+	bitmap_len = (entry->eax >> 24) & 0xff;
+	pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1);
+
+	if (pmu->version == 1) {
+		pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1;
+		return;
+	}
+
+	pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f),
+			X86_PMC_MAX_FIXED);
+	pmu->counter_bitmask[KVM_PMC_FIXED] =
+		((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
+	pmu->global_ctrl_mask = ~(((1 << pmu->nr_arch_gp_counters) - 1)
+			| (((1ull << pmu->nr_arch_fixed_counters) - 1)
+				<< X86_PMC_IDX_FIXED));
+}
+
+void kvm_pmu_init(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+
+	memset(pmu, 0, sizeof(*pmu));
+	for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
+		pmu->gp_counters[i].type = KVM_PMC_GP;
+		pmu->gp_counters[i].vcpu = vcpu;
+		pmu->gp_counters[i].idx = i;
+	}
+	for (i = 0; i < X86_PMC_MAX_FIXED; i++) {
+		pmu->fixed_counters[i].type = KVM_PMC_FIXED;
+		pmu->fixed_counters[i].vcpu = vcpu;
+		pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED;
+	}
+	init_irq_work(&pmu->irq_work, trigger_pmi);
+	kvm_pmu_cpuid_update(vcpu);
+}
+
+void kvm_pmu_reset(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	int i;
+
+	irq_work_sync(&pmu->irq_work);
+	for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
+		struct kvm_pmc *pmc = &pmu->gp_counters[i];
+		stop_counter(pmc);
+		pmc->counter = pmc->eventsel = 0;
+	}
+
+	for (i = 0; i < X86_PMC_MAX_FIXED; i++)
+		stop_counter(&pmu->fixed_counters[i]);
+
+	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
+		pmu->global_ovf_ctrl = 0;
+}
+
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
+{
+	kvm_pmu_reset(vcpu);
+}
+
+void kvm_handle_pmu_event(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	u64 bitmask;
+	int bit;
+
+	bitmask = pmu->reprogram_pmi;
+
+	for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
+		struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit);
+
+		if (unlikely(!pmc || !pmc->perf_event)) {
+			clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
+			continue;
+		}
+
+		reprogram_idx(pmu, bit);
+	}
+}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0a646e2b57c5..08ae951ecc5c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1602,8 +1602,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	 * which we perfectly emulate ;-). Any other value should be at least
 	 * reported, some guests depend on them.
 	 */
-	case MSR_P6_EVNTSEL0:
-	case MSR_P6_EVNTSEL1:
 	case MSR_K7_EVNTSEL0:
 	case MSR_K7_EVNTSEL1:
 	case MSR_K7_EVNTSEL2:
@@ -1615,8 +1613,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	/* at least RHEL 4 unconditionally writes to the perfctr registers,
 	 * so we ignore writes to make it happy.
 	 */
-	case MSR_P6_PERFCTR0:
-	case MSR_P6_PERFCTR1:
 	case MSR_K7_PERFCTR0:
 	case MSR_K7_PERFCTR1:
 	case MSR_K7_PERFCTR2:
@@ -1653,6 +1649,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	default:
 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
 			return xen_hvm_config(vcpu, data);
+		if (kvm_pmu_msr(vcpu, msr))
+			return kvm_pmu_set_msr(vcpu, msr, data);
 		if (!ignore_msrs) {
 			pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
 				msr, data);
@@ -1815,10 +1813,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_K8_SYSCFG:
 	case MSR_K7_HWCR:
 	case MSR_VM_HSAVE_PA:
-	case MSR_P6_PERFCTR0:
-	case MSR_P6_PERFCTR1:
-	case MSR_P6_EVNTSEL0:
-	case MSR_P6_EVNTSEL1:
 	case MSR_K7_EVNTSEL0:
 	case MSR_K7_PERFCTR0:
 	case MSR_K8_INT_PENDING_MSG:
@@ -1929,6 +1923,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		data = 0xbe702111;
 		break;
 	default:
+		if (kvm_pmu_msr(vcpu, msr))
+			return kvm_pmu_get_msr(vcpu, msr, pdata);
 		if (!ignore_msrs) {
 			pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 			return 1;
@@ -4650,7 +4646,7 @@ static void kvm_timer_init(void)
 
 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
 
-static int kvm_is_in_guest(void)
+int kvm_is_in_guest(void)
 {
 	return __this_cpu_read(current_vcpu) != NULL;
 }
@@ -5114,6 +5110,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			process_nmi(vcpu);
 		req_immediate_exit =
 			kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
+		if (kvm_check_request(KVM_REQ_PMU, vcpu))
+			kvm_handle_pmu_event(vcpu);
+		if (kvm_check_request(KVM_REQ_PMI, vcpu))
+			kvm_deliver_pmi(vcpu);
 	}
 
 	r = kvm_mmu_reload(vcpu);
@@ -5850,6 +5850,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 	kvm_async_pf_hash_reset(vcpu);
 	vcpu->arch.apf.halted = false;
 
+	kvm_pmu_reset(vcpu);
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
@@ -5934,6 +5936,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		goto fail_free_mce_banks;
 
 	kvm_async_pf_hash_reset(vcpu);
+	kvm_pmu_init(vcpu);
 
 	return 0;
 fail_free_mce_banks:
@@ -5952,6 +5955,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 	int idx;
 
+	kvm_pmu_destroy(vcpu);
 	kfree(vcpu->arch.mce_banks);
 	kvm_free_lapic(vcpu);
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7a080383935f..900c76337e8f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -52,6 +52,8 @@
 #define KVM_REQ_STEAL_UPDATE      13
 #define KVM_REQ_NMI               14
 #define KVM_REQ_IMMEDIATE_EXIT    15
+#define KVM_REQ_PMU               16
+#define KVM_REQ_PMI               17
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
-- 
cgit v1.2.3-55-g7522


From 022cd0e84020eec8b589bc119699c935c7b29584 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 10 Nov 2011 14:57:23 +0200
Subject: KVM: Add generic RDPMC support

Add a helper function that emulates the RDPMC instruction operation.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 15 +++++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fb60ffdb4e43..52d6640a5ca1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -760,6 +760,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+bool kvm_rdpmc(struct kvm_vcpu *vcpu);
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 08ae951ecc5c..27d18b7617f3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -760,6 +760,21 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 }
 EXPORT_SYMBOL_GPL(kvm_get_dr);
 
+bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+{
+	u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	u64 data;
+	int err;
+
+	err = kvm_pmu_read_pmc(vcpu, ecx, &data);
+	if (err)
+		return err;
+	kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
+	return err;
+}
+EXPORT_SYMBOL_GPL(kvm_rdpmc);
+
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
-- 
cgit v1.2.3-55-g7522


From 332b56e4841ef62db4dbf1b4b92195575e1c7338 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 10 Nov 2011 14:57:24 +0200
Subject: KVM: SVM: Intercept RDPMC

Intercept RDPMC and forward it to the PMU emulation code.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e32243eac2f4..5fa553babe56 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1014,6 +1014,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	set_intercept(svm, INTERCEPT_NMI);
 	set_intercept(svm, INTERCEPT_SMI);
 	set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+	set_intercept(svm, INTERCEPT_RDPMC);
 	set_intercept(svm, INTERCEPT_CPUID);
 	set_intercept(svm, INTERCEPT_INVD);
 	set_intercept(svm, INTERCEPT_HLT);
@@ -2770,6 +2771,19 @@ static int emulate_on_interception(struct vcpu_svm *svm)
 	return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
 }
 
+static int rdpmc_interception(struct vcpu_svm *svm)
+{
+	int err;
+
+	if (!static_cpu_has(X86_FEATURE_NRIPS))
+		return emulate_on_interception(svm);
+
+	err = kvm_rdpmc(&svm->vcpu);
+	kvm_complete_insn_gp(&svm->vcpu, err);
+
+	return 1;
+}
+
 bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
 {
 	unsigned long cr0 = svm->vcpu.arch.cr0;
@@ -3190,6 +3204,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_SMI]				= nop_on_interception,
 	[SVM_EXIT_INIT]				= nop_on_interception,
 	[SVM_EXIT_VINTR]			= interrupt_window_interception,
+	[SVM_EXIT_RDPMC]			= rdpmc_interception,
 	[SVM_EXIT_CPUID]			= cpuid_interception,
 	[SVM_EXIT_IRET]                         = iret_interception,
 	[SVM_EXIT_INVD]                         = emulate_on_interception,
-- 
cgit v1.2.3-55-g7522


From fee84b079d5ddee2247b5c1f53162c330c622902 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 10 Nov 2011 14:57:25 +0200
Subject: KVM: VMX: Intercept RDPMC

Intercept RDPMC and forward it to the PMU emulation code.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4ceced2669ef..906a7e84200f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1956,6 +1956,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #endif
 		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
 		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
+		CPU_BASED_RDPMC_EXITING |
 		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 	/*
 	 * We can allow some features even when not supported by the
@@ -2410,7 +2411,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_USE_TSC_OFFSETING |
 	      CPU_BASED_MWAIT_EXITING |
 	      CPU_BASED_MONITOR_EXITING |
-	      CPU_BASED_INVLPG_EXITING;
+	      CPU_BASED_INVLPG_EXITING |
+	      CPU_BASED_RDPMC_EXITING;
 
 	if (yield_on_hlt)
 		min |= CPU_BASED_HLT_EXITING;
@@ -4613,6 +4615,16 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_rdpmc(struct kvm_vcpu *vcpu)
+{
+	int err;
+
+	err = kvm_rdpmc(vcpu);
+	kvm_complete_insn_gp(vcpu, err);
+
+	return 1;
+}
+
 static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
 	skip_emulated_instruction(vcpu);
@@ -5563,6 +5575,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_HLT]                     = handle_halt,
 	[EXIT_REASON_INVD]		      = handle_invd,
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
+	[EXIT_REASON_RDPMC]                   = handle_rdpmc,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
-- 
cgit v1.2.3-55-g7522


From a6c06ed1a60aff77b27ba558c315c3fed4e35565 Mon Sep 17 00:00:00 2001
From: Gleb Natapov
Date: Thu, 10 Nov 2011 14:57:28 +0200
Subject: KVM: Expose the architectural performance monitoring CPUID leaf

Provide a CPUID leaf that describes the emulated PMU.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/cpuid.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index e70be46f50fc..89b02bfaaca5 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -327,6 +327,35 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	}
 	case 9:
 		break;
+	case 0xa: { /* Architectural Performance Monitoring */
+		struct x86_pmu_capability cap;
+		union cpuid10_eax eax;
+		union cpuid10_edx edx;
+
+		perf_get_x86_pmu_capability(&cap);
+
+		/*
+		 * Only support guest architectural pmu on a host
+		 * with architectural pmu.
+		 */
+		if (!cap.version)
+			memset(&cap, 0, sizeof(cap));
+
+		eax.split.version_id = min(cap.version, 2);
+		eax.split.num_counters = cap.num_counters_gp;
+		eax.split.bit_width = cap.bit_width_gp;
+		eax.split.mask_length = cap.events_mask_len;
+
+		edx.split.num_counters_fixed = cap.num_counters_fixed;
+		edx.split.bit_width_fixed = cap.bit_width_fixed;
+		edx.split.reserved = 0;
+
+		entry->eax = eax.full;
+		entry->ebx = cap.events_mask;
+		entry->ecx = 0;
+		entry->edx = edx.full;
+		break;
+	}
 	/* function 0xb has additional index. */
 	case 0xb: {
 		int i, level_type;
@@ -427,7 +456,6 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	case 3: /* Processor serial number */
 	case 5: /* MONITOR/MWAIT */
 	case 6: /* Thermal management */
-	case 0xA: /* Architectural Performance Monitoring */
 	case 0x80000007: /* Advanced power management */
 	case 0xC0000002:
 	case 0xC0000003:
-- 
cgit v1.2.3-55-g7522


From 80bdec64c05b645708b0dd97919783ad077fcdc8 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 10 Nov 2011 14:57:29 +0200
Subject: KVM: x86 emulator: fix RDPMC privilege check

RDPMC is only privileged if CR4.PCE=0.  check_rdpmc() already implements this,
so all we need to do is drop the Priv flag.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index de7be77820d5..d270f1a817dc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3411,7 +3411,7 @@ static struct opcode twobyte_table[256] = {
 	II(ImplicitOps | Priv, em_wrmsr, wrmsr),
 	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
 	II(ImplicitOps | Priv, em_rdmsr, rdmsr),
-	DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
+	DIP(ImplicitOps, rdpmc, check_rdpmc),
 	I(ImplicitOps | VendorSpecific, em_sysenter),
 	I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
 	N, N,
-- 
cgit v1.2.3-55-g7522


From 222d21aa070a4885ce3c7125a1b7ce07429ea4a1 Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Thu, 10 Nov 2011 14:57:30 +0200
Subject: KVM: x86 emulator: implement RDPMC (0F 33)

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  1 +
 arch/x86/kvm/emulate.c             | 13 ++++++++++++-
 arch/x86/kvm/x86.c                 |  7 +++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 9a4acf41709c..ab4092e3214e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -181,6 +181,7 @@ struct x86_emulate_ops {
 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
 	int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
 	int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+	int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
 	void (*halt)(struct x86_emulate_ctxt *ctxt);
 	void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
 	int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d270f1a817dc..05a562b85025 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2645,6 +2645,17 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+	u64 pmc;
+
+	if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc))
+		return emulate_gp(ctxt, 0);
+	ctxt->regs[VCPU_REGS_RAX] = (u32)pmc;
+	ctxt->regs[VCPU_REGS_RDX] = pmc >> 32;
+	return X86EMUL_CONTINUE;
+}
+
 static int em_mov(struct x86_emulate_ctxt *ctxt)
 {
 	ctxt->dst.val = ctxt->src.val;
@@ -3411,7 +3422,7 @@ static struct opcode twobyte_table[256] = {
 	II(ImplicitOps | Priv, em_wrmsr, wrmsr),
 	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
 	II(ImplicitOps | Priv, em_rdmsr, rdmsr),
-	DIP(ImplicitOps, rdpmc, check_rdpmc),
+	IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
 	I(ImplicitOps | VendorSpecific, em_sysenter),
 	I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
 	N, N,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 27d18b7617f3..1171def5f96b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4146,6 +4146,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
 	return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
 }
 
+static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
+			     u32 pmc, u64 *pdata)
+{
+	return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
+}
+
 static void emulator_halt(struct x86_emulate_ctxt *ctxt)
 {
 	emul_to_vcpu(ctxt)->arch.halt_request = 1;
@@ -4198,6 +4204,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.set_dr              = emulator_set_dr,
 	.set_msr             = emulator_set_msr,
 	.get_msr             = emulator_get_msr,
+	.read_pmc            = emulator_read_pmc,
 	.halt                = emulator_halt,
 	.wbinvd              = emulator_wbinvd,
 	.fix_hypercall       = emulator_fix_hypercall,
-- 
cgit v1.2.3-55-g7522


From 7c9c3a1e5fc8728e948b8fa3cbcfcfb86db3afda Mon Sep 17 00:00:00 2001
From: Alan Cox
Date: Thu, 29 Dec 2011 14:43:16 +0000
Subject: x86/intel config: Fix the APB_TIMER selection

Seems Kconfig SELECT isn't selecting things hierarchically when
selected.

config APB_TIMER
       def_bool y if X86_INTEL_MID
       prompt "Intel MID APB Timer Support" if X86_INTEL_MID
       select DW_APB_TIMER
       depends on X86_INTEL_MID && SFI

when we select APB_TIMER doesn't select DW_APB_TIMER so do it by
hand.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/n/tip-kpnaimplltk6d1lolusqj3ae@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 07620bc913db..78fbb346959b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -409,12 +409,14 @@ config X86_MRST
 	depends on PCI
 	depends on PCI_GOANY
 	depends on X86_IO_APIC
+	select X86_INTEL_MID
+	select SFI
+	select DW_APB_TIMER
 	select APB_TIMER
 	select I2C
 	select SPI
 	select INTEL_SCU_IPC
 	select X86_PLATFORM_DEVICES
-	select X86_INTEL_MID
 	---help---
 	  Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
 	  Internet Device(MID) platform. Moorestown consists of two chips:
@@ -428,12 +430,14 @@ config X86_MDFLD
 	depends on PCI
 	depends on PCI_GOANY
 	depends on X86_IO_APIC
+	select X86_INTEL_MID
+	select SFI
+	select DW_APB_TIMER
 	select APB_TIMER
 	select I2C
 	select SPI
 	select INTEL_SCU_IPC
 	select X86_PLATFORM_DEVICES
-	select X86_INTEL_MID
 	---help---
 	  Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
 	  Internet Device(MID) platform. 
-- 
cgit v1.2.3-55-g7522


From 2c9ede55ecec58099b72e4bb8eab719f32f72c31 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Sat, 23 Jul 2011 20:24:48 -0400
Subject: switch device_get_devnode() and ->devnode() to umode_t *

both callers of device_get_devnode() are only interested in lower 16bits
and nobody tries to return anything wider than 16bit anyway.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/kernel/cpuid.c                    | 2 +-
 arch/x86/kernel/msr.c                      | 2 +-
 block/bsg.c                                | 2 +-
 block/genhd.c                              | 2 +-
 drivers/base/core.c                        | 4 ++--
 drivers/base/devtmpfs.c                    | 2 +-
 drivers/block/aoe/aoechr.c                 | 2 +-
 drivers/block/pktcdvd.c                    | 2 +-
 drivers/char/mem.c                         | 4 ++--
 drivers/char/misc.c                        | 2 +-
 drivers/char/raw.c                         | 2 +-
 drivers/char/tile-srom.c                   | 2 +-
 drivers/gpu/drm/drm_sysfs.c                | 2 +-
 drivers/hid/usbhid/hiddev.c                | 2 +-
 drivers/infiniband/core/cm.c               | 2 +-
 drivers/infiniband/core/user_mad.c         | 2 +-
 drivers/infiniband/core/uverbs_main.c      | 2 +-
 drivers/input/input.c                      | 2 +-
 drivers/media/dvb/ddbridge/ddbridge-core.c | 2 +-
 drivers/media/dvb/dvb-core/dvbdev.c        | 2 +-
 drivers/media/rc/rc-main.c                 | 2 +-
 drivers/tty/tty_io.c                       | 2 +-
 drivers/usb/class/usblp.c                  | 2 +-
 drivers/usb/core/file.c                    | 2 +-
 drivers/usb/core/usb.c                     | 2 +-
 drivers/usb/misc/iowarrior.c               | 2 +-
 drivers/usb/misc/legousbtower.c            | 2 +-
 include/linux/device.h                     | 6 +++---
 include/linux/genhd.h                      | 2 +-
 include/linux/usb.h                        | 2 +-
 sound/sound_core.c                         | 2 +-
 31 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 212a6a42527c..a524353d93f2 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
 	.notifier_call = cpuid_class_cpu_callback,
 };
 
-static char *cpuid_devnode(struct device *dev, mode_t *mode)
+static char *cpuid_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
 }
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 12fcbe2c143e..96356762a51d 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
 	.notifier_call = msr_class_cpu_callback,
 };
 
-static char *msr_devnode(struct device *dev, mode_t *mode)
+static char *msr_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
 }
diff --git a/block/bsg.c b/block/bsg.c
index 702f1316bb8f..9651ec7b87c2 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -1070,7 +1070,7 @@ EXPORT_SYMBOL_GPL(bsg_register_queue);
 
 static struct cdev bsg_cdev;
 
-static char *bsg_devnode(struct device *dev, mode_t *mode)
+static char *bsg_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev));
 }
diff --git a/block/genhd.c b/block/genhd.c
index 02e9fca80825..80578f3176ef 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1109,7 +1109,7 @@ struct class block_class = {
 	.name		= "block",
 };
 
-static char *block_devnode(struct device *dev, mode_t *mode)
+static char *block_devnode(struct device *dev, umode_t *mode)
 {
 	struct gendisk *disk = dev_to_disk(dev);
 
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 919daa7cd5b1..1dfa1d616fa5 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -198,7 +198,7 @@ static int dev_uevent(struct kset *kset, struct kobject *kobj,
 	if (MAJOR(dev->devt)) {
 		const char *tmp;
 		const char *name;
-		mode_t mode = 0;
+		umode_t mode = 0;
 
 		add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt));
 		add_uevent_var(env, "MINOR=%u", MINOR(dev->devt));
@@ -1182,7 +1182,7 @@ static struct device *next_device(struct klist_iter *i)
  * freed by the caller.
  */
 const char *device_get_devnode(struct device *dev,
-			       mode_t *mode, const char **tmp)
+			       umode_t *mode, const char **tmp)
 {
 	char *s;
 
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index a4760e095ff5..3990f682e690 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -40,7 +40,7 @@ static struct req {
 	struct completion done;
 	int err;
 	const char *name;
-	mode_t mode;	/* 0 => delete */
+	umode_t mode;	/* 0 => delete */
 	struct device *dev;
 } *requests;
 
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 5f8e39c43ae5..e86d2062a164 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -270,7 +270,7 @@ static const struct file_operations aoe_fops = {
 	.llseek = noop_llseek,
 };
 
-static char *aoe_devnode(struct device *dev, mode_t *mode)
+static char *aoe_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev));
 }
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index a63b0a2b7805..d59edeabd93f 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2817,7 +2817,7 @@ static const struct block_device_operations pktcdvd_ops = {
 	.check_events =		pkt_check_events,
 };
 
-static char *pktcdvd_devnode(struct gendisk *gd, mode_t *mode)
+static char *pktcdvd_devnode(struct gendisk *gd, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "pktcdvd/%s", gd->disk_name);
 }
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 145179033716..d6e9d081c8b1 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -847,7 +847,7 @@ static const struct file_operations kmsg_fops = {
 
 static const struct memdev {
 	const char *name;
-	mode_t mode;
+	umode_t mode;
 	const struct file_operations *fops;
 	struct backing_dev_info *dev_info;
 } devlist[] = {
@@ -901,7 +901,7 @@ static const struct file_operations memory_fops = {
 	.llseek = noop_llseek,
 };
 
-static char *mem_devnode(struct device *dev, mode_t *mode)
+static char *mem_devnode(struct device *dev, umode_t *mode)
 {
 	if (mode && devlist[MINOR(dev->devt)].mode)
 		*mode = devlist[MINOR(dev->devt)].mode;
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 778273c93242..522136d40843 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -258,7 +258,7 @@ int misc_deregister(struct miscdevice *misc)
 EXPORT_SYMBOL(misc_register);
 EXPORT_SYMBOL(misc_deregister);
 
-static char *misc_devnode(struct device *dev, mode_t *mode)
+static char *misc_devnode(struct device *dev, umode_t *mode)
 {
 	struct miscdevice *c = dev_get_drvdata(dev);
 
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index b6de2c047145..54a3a6d09819 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -308,7 +308,7 @@ static const struct file_operations raw_ctl_fops = {
 
 static struct cdev raw_cdev;
 
-static char *raw_devnode(struct device *dev, mode_t *mode)
+static char *raw_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "raw/%s", dev_name(dev));
 }
diff --git a/drivers/char/tile-srom.c b/drivers/char/tile-srom.c
index cf3ee008dca2..4dc019408fac 100644
--- a/drivers/char/tile-srom.c
+++ b/drivers/char/tile-srom.c
@@ -329,7 +329,7 @@ static struct device_attribute srom_dev_attrs[] = {
 	__ATTR_NULL
 };
 
-static char *srom_devnode(struct device *dev, mode_t *mode)
+static char *srom_devnode(struct device *dev, umode_t *mode)
 {
 	*mode = S_IRUGO | S_IWUSR;
 	return kasprintf(GFP_KERNEL, "srom/%s", dev_name(dev));
diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
index 0f9ef9bf6730..62c3675045ac 100644
--- a/drivers/gpu/drm/drm_sysfs.c
+++ b/drivers/gpu/drm/drm_sysfs.c
@@ -72,7 +72,7 @@ static int drm_class_resume(struct device *dev)
 	return 0;
 }
 
-static char *drm_devnode(struct device *dev, mode_t *mode)
+static char *drm_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "dri/%s", dev_name(dev));
 }
diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c
index 4ef02b269a71..7c297d305d5d 100644
--- a/drivers/hid/usbhid/hiddev.c
+++ b/drivers/hid/usbhid/hiddev.c
@@ -859,7 +859,7 @@ static const struct file_operations hiddev_fops = {
 	.llseek		= noop_llseek,
 };
 
-static char *hiddev_devnode(struct device *dev, mode_t *mode)
+static char *hiddev_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev));
 }
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 8b72f39202fb..c889aaef3416 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -3659,7 +3659,7 @@ static struct kobj_type cm_port_obj_type = {
 	.release = cm_release_port_obj
 };
 
-static char *cm_devnode(struct device *dev, mode_t *mode)
+static char *cm_devnode(struct device *dev, umode_t *mode)
 {
 	if (mode)
 		*mode = 0666;
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 07db22997e97..f0d588f8859e 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -1175,7 +1175,7 @@ static void ib_umad_remove_one(struct ib_device *device)
 	kref_put(&umad_dev->ref, ib_umad_release_dev);
 }
 
-static char *umad_devnode(struct device *dev, mode_t *mode)
+static char *umad_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
 }
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 879636746373..604556d73d25 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -846,7 +846,7 @@ static void ib_uverbs_remove_one(struct ib_device *device)
 	kfree(uverbs_dev);
 }
 
-static char *uverbs_devnode(struct device *dev, mode_t *mode)
+static char *uverbs_devnode(struct device *dev, umode_t *mode)
 {
 	if (mode)
 		*mode = 0666;
diff --git a/drivers/input/input.c b/drivers/input/input.c
index da38d97a51b1..1f78c957a75a 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -1624,7 +1624,7 @@ static struct device_type input_dev_type = {
 #endif
 };
 
-static char *input_devnode(struct device *dev, mode_t *mode)
+static char *input_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "input/%s", dev_name(dev));
 }
diff --git a/drivers/media/dvb/ddbridge/ddbridge-core.c b/drivers/media/dvb/ddbridge/ddbridge-core.c
index ba9a643b9c6a..d1e91bc80e78 100644
--- a/drivers/media/dvb/ddbridge/ddbridge-core.c
+++ b/drivers/media/dvb/ddbridge/ddbridge-core.c
@@ -1480,7 +1480,7 @@ static const struct file_operations ddb_fops = {
 	.open           = ddb_open,
 };
 
-static char *ddb_devnode(struct device *device, mode_t *mode)
+static char *ddb_devnode(struct device *device, umode_t *mode)
 {
 	struct ddb *dev = dev_get_drvdata(device);
 
diff --git a/drivers/media/dvb/dvb-core/dvbdev.c b/drivers/media/dvb/dvb-core/dvbdev.c
index f73287775953..00a67326c193 100644
--- a/drivers/media/dvb/dvb-core/dvbdev.c
+++ b/drivers/media/dvb/dvb-core/dvbdev.c
@@ -450,7 +450,7 @@ static int dvb_uevent(struct device *dev, struct kobj_uevent_env *env)
 	return 0;
 }
 
-static char *dvb_devnode(struct device *dev, mode_t *mode)
+static char *dvb_devnode(struct device *dev, umode_t *mode)
 {
 	struct dvb_device *dvbdev = dev_get_drvdata(dev);
 
diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c
index 29f900065d8a..f5db8b949bc3 100644
--- a/drivers/media/rc/rc-main.c
+++ b/drivers/media/rc/rc-main.c
@@ -715,7 +715,7 @@ static void ir_close(struct input_dev *idev)
 }
 
 /* class for /sys/class/rc */
-static char *ir_devnode(struct device *dev, mode_t *mode)
+static char *ir_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "rc/%s", dev_name(dev));
 }
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 05085beb83db..3fdebd306b94 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -3267,7 +3267,7 @@ void __init console_init(void)
 	}
 }
 
-static char *tty_devnode(struct device *dev, mode_t *mode)
+static char *tty_devnode(struct device *dev, umode_t *mode)
 {
 	if (!mode)
 		return NULL;
diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c
index cb3a93243a05..bc5089f76cec 100644
--- a/drivers/usb/class/usblp.c
+++ b/drivers/usb/class/usblp.c
@@ -1045,7 +1045,7 @@ static const struct file_operations usblp_fops = {
 	.llseek =	noop_llseek,
 };
 
-static char *usblp_devnode(struct device *dev, mode_t *mode)
+static char *usblp_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev));
 }
diff --git a/drivers/usb/core/file.c b/drivers/usb/core/file.c
index 99458c843d60..d95760de9e8b 100644
--- a/drivers/usb/core/file.c
+++ b/drivers/usb/core/file.c
@@ -66,7 +66,7 @@ static struct usb_class {
 	struct class *class;
 } *usb_class;
 
-static char *usb_devnode(struct device *dev, mode_t *mode)
+static char *usb_devnode(struct device *dev, umode_t *mode)
 {
 	struct usb_class_driver *drv;
 
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index 73cd90012ec5..1382c90d0834 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -326,7 +326,7 @@ static const struct dev_pm_ops usb_device_pm_ops = {
 #endif	/* CONFIG_PM */
 
 
-static char *usb_devnode(struct device *dev, mode_t *mode)
+static char *usb_devnode(struct device *dev, umode_t *mode)
 {
 	struct usb_device *usb_dev;
 
diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c
index 81457904d6ba..5bd4b0526de5 100644
--- a/drivers/usb/misc/iowarrior.c
+++ b/drivers/usb/misc/iowarrior.c
@@ -734,7 +734,7 @@ static const struct file_operations iowarrior_fops = {
 	.llseek = noop_llseek,
 };
 
-static char *iowarrior_devnode(struct device *dev, mode_t *mode)
+static char *iowarrior_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev));
 }
diff --git a/drivers/usb/misc/legousbtower.c b/drivers/usb/misc/legousbtower.c
index a989356f693e..94f6566b99f8 100644
--- a/drivers/usb/misc/legousbtower.c
+++ b/drivers/usb/misc/legousbtower.c
@@ -269,7 +269,7 @@ static const struct file_operations tower_fops = {
 	.llseek =	tower_llseek,
 };
 
-static char *legousbtower_devnode(struct device *dev, mode_t *mode)
+static char *legousbtower_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev));
 }
diff --git a/include/linux/device.h b/include/linux/device.h
index 3136ede5a1e1..2fe0005543ed 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -294,7 +294,7 @@ struct class {
 	struct kobject			*dev_kobj;
 
 	int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env);
-	char *(*devnode)(struct device *dev, mode_t *mode);
+	char *(*devnode)(struct device *dev, umode_t *mode);
 
 	void (*class_release)(struct class *class);
 	void (*dev_release)(struct device *dev);
@@ -423,7 +423,7 @@ struct device_type {
 	const char *name;
 	const struct attribute_group **groups;
 	int (*uevent)(struct device *dev, struct kobj_uevent_env *env);
-	char *(*devnode)(struct device *dev, mode_t *mode);
+	char *(*devnode)(struct device *dev, umode_t *mode);
 	void (*release)(struct device *dev);
 
 	const struct dev_pm_ops *pm;
@@ -720,7 +720,7 @@ extern int device_rename(struct device *dev, const char *new_name);
 extern int device_move(struct device *dev, struct device *new_parent,
 		       enum dpm_order dpm_order);
 extern const char *device_get_devnode(struct device *dev,
-				      mode_t *mode, const char **tmp);
+				      umode_t *mode, const char **tmp);
 extern void *dev_get_drvdata(const struct device *dev);
 extern int dev_set_drvdata(struct device *dev, void *data);
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 6d18f3531f18..fe23ee768589 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -163,7 +163,7 @@ struct gendisk {
                                          * disks that can't be partitioned. */
 
 	char disk_name[DISK_NAME_LEN];	/* name of major driver */
-	char *(*devnode)(struct gendisk *gd, mode_t *mode);
+	char *(*devnode)(struct gendisk *gd, umode_t *mode);
 
 	unsigned int events;		/* supported events */
 	unsigned int async_events;	/* async events, subset of all */
diff --git a/include/linux/usb.h b/include/linux/usb.h
index d3d0c1374334..a59321779f8b 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -935,7 +935,7 @@ extern struct bus_type usb_bus_type;
  */
 struct usb_class_driver {
 	char *name;
-	char *(*devnode)(struct device *dev, mode_t *mode);
+	char *(*devnode)(struct device *dev, umode_t *mode);
 	const struct file_operations *fops;
 	int minor_base;
 };
diff --git a/sound/sound_core.c b/sound/sound_core.c
index 6ce277860fd7..c6e81fb928e9 100644
--- a/sound/sound_core.c
+++ b/sound/sound_core.c
@@ -29,7 +29,7 @@ MODULE_DESCRIPTION("Core sound module");
 MODULE_AUTHOR("Alan Cox");
 MODULE_LICENSE("GPL");
 
-static char *sound_devnode(struct device *dev, mode_t *mode)
+static char *sound_devnode(struct device *dev, umode_t *mode)
 {
 	if (MAJOR(dev->devt) == SOUND_MAJOR)
 		return NULL;
-- 
cgit v1.2.3-55-g7522


From f4ae40a6a50a98ac23d4b285f739455e926a473e Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Sun, 24 Jul 2011 04:33:43 -0400
Subject: switch debugfs to umode_t

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/debugfs.txt              | 24 +++++------
 arch/arm/mach-msm/smd_debug.c                      |  2 +-
 arch/s390/include/asm/debug.h                      |  4 +-
 arch/s390/kernel/debug.c                           |  8 ++--
 arch/x86/xen/debugfs.c                             |  2 +-
 arch/x86/xen/debugfs.h                             |  2 +-
 drivers/acpi/ec_sys.c                              |  2 +-
 drivers/mmc/card/mmc_test.c                        |  2 +-
 .../net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c    |  2 +-
 drivers/net/wireless/ath/carl9170/debug.c          |  2 +-
 drivers/net/wireless/libertas/debugfs.c            |  2 +-
 drivers/s390/block/dasd.c                          |  4 +-
 drivers/scsi/bfa/bfad_debugfs.c                    |  2 +-
 fs/debugfs/file.c                                  | 22 +++++------
 fs/debugfs/inode.c                                 | 14 +++----
 fs/ocfs2/cluster/netdebug.c                        |  2 +-
 include/linux/debugfs.h                            | 46 +++++++++++-----------
 include/linux/relay.h                              |  2 +-
 kernel/relay.c                                     |  2 +-
 kernel/trace/blktrace.c                            |  2 +-
 kernel/trace/trace.c                               |  2 +-
 kernel/trace/trace.h                               |  2 +-
 lib/fault-inject.c                                 |  8 ++--
 mm/failslab.c                                      |  2 +-
 mm/page_alloc.c                                    |  2 +-
 25 files changed, 82 insertions(+), 82 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
index 742cc06e138f..9281a95d689f 100644
--- a/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@ -35,7 +35,7 @@ described below will work.
 
 The most general way to create a file within a debugfs directory is with:
 
-    struct dentry *debugfs_create_file(const char *name, mode_t mode,
+    struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				       struct dentry *parent, void *data,
 				       const struct file_operations *fops);
 
@@ -53,13 +53,13 @@ actually necessary; the debugfs code provides a number of helper functions
 for simple situations.  Files containing a single integer value can be
 created with any of:
 
-    struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+    struct dentry *debugfs_create_u8(const char *name, umode_t mode,
 				     struct dentry *parent, u8 *value);
-    struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+    struct dentry *debugfs_create_u16(const char *name, umode_t mode,
 				      struct dentry *parent, u16 *value);
-    struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+    struct dentry *debugfs_create_u32(const char *name, umode_t mode,
 				      struct dentry *parent, u32 *value);
-    struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+    struct dentry *debugfs_create_u64(const char *name, umode_t mode,
 				      struct dentry *parent, u64 *value);
 
 These files support both reading and writing the given value; if a specific
@@ -67,13 +67,13 @@ file should not be written to, simply set the mode bits accordingly.  The
 values in these files are in decimal; if hexadecimal is more appropriate,
 the following functions can be used instead:
 
-    struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+    struct dentry *debugfs_create_x8(const char *name, umode_t mode,
 				     struct dentry *parent, u8 *value);
-    struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+    struct dentry *debugfs_create_x16(const char *name, umode_t mode,
 				      struct dentry *parent, u16 *value);
-    struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+    struct dentry *debugfs_create_x32(const char *name, umode_t mode,
 				      struct dentry *parent, u32 *value);
-    struct dentry *debugfs_create_x64(const char *name, mode_t mode,
+    struct dentry *debugfs_create_x64(const char *name, umode_t mode,
 				      struct dentry *parent, u64 *value);
 
 These functions are useful as long as the developer knows the size of the
@@ -81,7 +81,7 @@ value to be exported.  Some types can have different widths on different
 architectures, though, complicating the situation somewhat.  There is a
 function meant to help out in one special case:
 
-    struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+    struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
 				         struct dentry *parent, 
 					 size_t *value);
 
@@ -90,7 +90,7 @@ a variable of type size_t.
 
 Boolean values can be placed in debugfs with:
 
-    struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+    struct dentry *debugfs_create_bool(const char *name, umode_t mode,
 				       struct dentry *parent, u32 *value);
 
 A read on the resulting file will yield either Y (for non-zero values) or
@@ -104,7 +104,7 @@ Finally, a block of arbitrary binary data can be exported with:
 	unsigned long size;
     };
 
-    struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+    struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 				       struct dentry *parent,
 				       struct debugfs_blob_wrapper *blob);
 
diff --git a/arch/arm/mach-msm/smd_debug.c b/arch/arm/mach-msm/smd_debug.c
index 8736afff82f3..0c56a5aaf588 100644
--- a/arch/arm/mach-msm/smd_debug.c
+++ b/arch/arm/mach-msm/smd_debug.c
@@ -215,7 +215,7 @@ static const struct file_operations debug_ops = {
 	.llseek = default_llseek,
 };
 
-static void debug_create(const char *name, mode_t mode,
+static void debug_create(const char *name, umode_t mode,
 			 struct dentry *dent,
 			 int (*fill)(char *buf, int max))
 {
diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h
index 18124b75a7ab..9d88db1f55d0 100644
--- a/arch/s390/include/asm/debug.h
+++ b/arch/s390/include/asm/debug.h
@@ -73,7 +73,7 @@ typedef struct debug_info {
 	struct dentry* debugfs_entries[DEBUG_MAX_VIEWS];
 	struct debug_view* views[DEBUG_MAX_VIEWS];	
 	char name[DEBUG_MAX_NAME_LEN];
-	mode_t mode;
+	umode_t mode;
 } debug_info_t;
 
 typedef int (debug_header_proc_t) (debug_info_t* id,
@@ -124,7 +124,7 @@ debug_info_t *debug_register(const char *name, int pages, int nr_areas,
                              int buf_size);
 
 debug_info_t *debug_register_mode(const char *name, int pages, int nr_areas,
-				  int buf_size, mode_t mode, uid_t uid,
+				  int buf_size, umode_t mode, uid_t uid,
 				  gid_t gid);
 
 void debug_unregister(debug_info_t* id);
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 5ad6bc078bfd..6848828b962e 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -74,7 +74,7 @@ static ssize_t debug_input(struct file *file, const char __user *user_buf,
 static int debug_open(struct inode *inode, struct file *file);
 static int debug_close(struct inode *inode, struct file *file);
 static debug_info_t *debug_info_create(const char *name, int pages_per_area,
-			int nr_areas, int buf_size, mode_t mode);
+			int nr_areas, int buf_size, umode_t mode);
 static void debug_info_get(debug_info_t *);
 static void debug_info_put(debug_info_t *);
 static int debug_prolog_level_fn(debug_info_t * id,
@@ -330,7 +330,7 @@ debug_info_free(debug_info_t* db_info){
 
 static debug_info_t*
 debug_info_create(const char *name, int pages_per_area, int nr_areas,
-		  int buf_size, mode_t mode)
+		  int buf_size, umode_t mode)
 {
 	debug_info_t* rc;
 
@@ -688,7 +688,7 @@ debug_close(struct inode *inode, struct file *file)
  */
 
 debug_info_t *debug_register_mode(const char *name, int pages_per_area,
-				  int nr_areas, int buf_size, mode_t mode,
+				  int nr_areas, int buf_size, umode_t mode,
 				  uid_t uid, gid_t gid)
 {
 	debug_info_t *rc = NULL;
@@ -1090,7 +1090,7 @@ debug_register_view(debug_info_t * id, struct debug_view *view)
 	int rc = 0;
 	int i;
 	unsigned long flags;
-	mode_t mode;
+	umode_t mode;
 	struct dentry *pde;
 
 	if (!id)
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 7c0fedd98ea0..ef1db1900d86 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -109,7 +109,7 @@ static const struct file_operations u32_array_fops = {
 	.llseek = no_llseek,
 };
 
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
 					    struct dentry *parent,
 					    u32 *array, unsigned elements)
 {
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
index e28132084832..78d25499be5b 100644
--- a/arch/x86/xen/debugfs.h
+++ b/arch/x86/xen/debugfs.h
@@ -3,7 +3,7 @@
 
 struct dentry * __init xen_init_debugfs(void);
 
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
 					    struct dentry *parent,
 					    u32 *array, unsigned elements);
 
diff --git a/drivers/acpi/ec_sys.c b/drivers/acpi/ec_sys.c
index 6c47ae9793a7..b258cab9061c 100644
--- a/drivers/acpi/ec_sys.c
+++ b/drivers/acpi/ec_sys.c
@@ -105,7 +105,7 @@ int acpi_ec_add_debugfs(struct acpi_ec *ec, unsigned int ec_device_count)
 {
 	struct dentry *dev_dir;
 	char name[64];
-	mode_t mode = 0400;
+	umode_t mode = 0400;
 
 	if (ec_device_count == 0) {
 		acpi_ec_debugfs_dir = debugfs_create_dir("ec", NULL);
diff --git a/drivers/mmc/card/mmc_test.c b/drivers/mmc/card/mmc_test.c
index b038c4a9468b..e99bdc18002d 100644
--- a/drivers/mmc/card/mmc_test.c
+++ b/drivers/mmc/card/mmc_test.c
@@ -2949,7 +2949,7 @@ static void mmc_test_free_dbgfs_file(struct mmc_card *card)
 }
 
 static int __mmc_test_register_dbgfs_file(struct mmc_card *card,
-	const char *name, mode_t mode, const struct file_operations *fops)
+	const char *name, umode_t mode, const struct file_operations *fops)
 {
 	struct dentry *file = NULL;
 	struct mmc_test_dbgfs_file *df;
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index da9072bfca8b..f5a24d99ef4f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -2000,7 +2000,7 @@ static const struct file_operations interfaces_proc_fops = {
  */
 struct cxgb4vf_debugfs_entry {
 	const char *name;		/* name of debugfs node */
-	mode_t mode;			/* file system mode */
+	umode_t mode;			/* file system mode */
 	const struct file_operations *fops;
 };
 
diff --git a/drivers/net/wireless/ath/carl9170/debug.c b/drivers/net/wireless/ath/carl9170/debug.c
index de57f90e1d5f..3c164226687f 100644
--- a/drivers/net/wireless/ath/carl9170/debug.c
+++ b/drivers/net/wireless/ath/carl9170/debug.c
@@ -56,7 +56,7 @@ static int carl9170_debugfs_open(struct inode *inode, struct file *file)
 
 struct carl9170_debugfs_fops {
 	unsigned int read_bufsize;
-	mode_t attr;
+	umode_t attr;
 	char *(*read)(struct ar9170 *ar, char *buf, size_t bufsize,
 		      ssize_t *len);
 	ssize_t (*write)(struct ar9170 *aru, const char *buf, size_t size);
diff --git a/drivers/net/wireless/libertas/debugfs.c b/drivers/net/wireless/libertas/debugfs.c
index d8d8f0d0899f..c192671610fc 100644
--- a/drivers/net/wireless/libertas/debugfs.c
+++ b/drivers/net/wireless/libertas/debugfs.c
@@ -704,7 +704,7 @@ out_unlock:
 
 struct lbs_debugfs_files {
 	const char *name;
-	int perm;
+	umode_t perm;
 	struct file_operations fops;
 };
 
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 65894f05a801..42986d7bcf9d 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1073,7 +1073,7 @@ static const struct file_operations dasd_stats_global_fops = {
 static void dasd_profile_init(struct dasd_profile *profile,
 			      struct dentry *base_dentry)
 {
-	mode_t mode;
+	umode_t mode;
 	struct dentry *pde;
 
 	if (!base_dentry)
@@ -1112,7 +1112,7 @@ static void dasd_statistics_removeroot(void)
 
 static void dasd_statistics_createroot(void)
 {
-	mode_t mode;
+	umode_t mode;
 	struct dentry *pde;
 
 	dasd_debugfs_root_entry = NULL;
diff --git a/drivers/scsi/bfa/bfad_debugfs.c b/drivers/scsi/bfa/bfad_debugfs.c
index dee1a094c2c2..caca9b7c8309 100644
--- a/drivers/scsi/bfa/bfad_debugfs.c
+++ b/drivers/scsi/bfa/bfad_debugfs.c
@@ -472,7 +472,7 @@ static const struct file_operations bfad_debugfs_op_regwr = {
 
 struct bfad_debugfs_entry {
 	const char *name;
-	mode_t	mode;
+	umode_t	mode;
 	const struct file_operations *fops;
 };
 
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 90f76575c056..d5016606fb27 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -95,7 +95,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
  * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
  * code.
  */
-struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+struct dentry *debugfs_create_u8(const char *name, umode_t mode,
 				 struct dentry *parent, u8 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -147,7 +147,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
  * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
  * code.
  */
-struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+struct dentry *debugfs_create_u16(const char *name, umode_t mode,
 				  struct dentry *parent, u16 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -199,7 +199,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
  * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
  * code.
  */
-struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+struct dentry *debugfs_create_u32(const char *name, umode_t mode,
 				 struct dentry *parent, u32 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -252,7 +252,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
  * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
  * code.
  */
-struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+struct dentry *debugfs_create_u64(const char *name, umode_t mode,
 				 struct dentry *parent, u64 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -298,7 +298,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n
  * @value: a pointer to the variable that the file should read to and write
  *         from.
  */
-struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+struct dentry *debugfs_create_x8(const char *name, umode_t mode,
 				 struct dentry *parent, u8 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -322,7 +322,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
  * @value: a pointer to the variable that the file should read to and write
  *         from.
  */
-struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+struct dentry *debugfs_create_x16(const char *name, umode_t mode,
 				 struct dentry *parent, u16 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -346,7 +346,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
  * @value: a pointer to the variable that the file should read to and write
  *         from.
  */
-struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+struct dentry *debugfs_create_x32(const char *name, umode_t mode,
 				 struct dentry *parent, u32 *value)
 {
 	/* if there are no write bits set, make read only */
@@ -370,7 +370,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32);
  * @value: a pointer to the variable that the file should read to and write
  *         from.
  */
-struct dentry *debugfs_create_x64(const char *name, mode_t mode,
+struct dentry *debugfs_create_x64(const char *name, umode_t mode,
 				 struct dentry *parent, u64 *value)
 {
 	return debugfs_create_file(name, mode, parent, value, &fops_x64);
@@ -401,7 +401,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
  * @value: a pointer to the variable that the file should read to and write
  *         from.
  */
-struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
 				     struct dentry *parent, size_t *value)
 {
 	return debugfs_create_file(name, mode, parent, value, &fops_size_t);
@@ -473,7 +473,7 @@ static const struct file_operations fops_bool = {
  * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
  * code.
  */
-struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+struct dentry *debugfs_create_bool(const char *name, umode_t mode,
 				   struct dentry *parent, u32 *value)
 {
 	return debugfs_create_file(name, mode, parent, value, &fops_bool);
@@ -518,7 +518,7 @@ static const struct file_operations fops_blob = {
  * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
  * code.
  */
-struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 				   struct dentry *parent,
 				   struct debugfs_blob_wrapper *blob)
 {
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c9dc08d0c100..956d5ddddf6e 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -30,7 +30,7 @@ static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
 static bool debugfs_registered;
 
-static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev,
+static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev,
 				       void *data, const struct file_operations *fops)
 
 {
@@ -69,7 +69,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 
 /* SMP-safe */
 static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
-			 int mode, dev_t dev, void *data,
+			 umode_t mode, dev_t dev, void *data,
 			 const struct file_operations *fops)
 {
 	struct inode *inode;
@@ -87,7 +87,7 @@ static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
 	return error;
 }
 
-static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode,
+static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode,
 			 void *data, const struct file_operations *fops)
 {
 	int res;
@@ -101,14 +101,14 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode,
 	return res;
 }
 
-static int debugfs_link(struct inode *dir, struct dentry *dentry, int mode,
+static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode,
 			void *data, const struct file_operations *fops)
 {
 	mode = (mode & S_IALLUGO) | S_IFLNK;
 	return debugfs_mknod(dir, dentry, mode, 0, data, fops);
 }
 
-static int debugfs_create(struct inode *dir, struct dentry *dentry, int mode,
+static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 			  void *data, const struct file_operations *fops)
 {
 	int res;
@@ -146,7 +146,7 @@ static struct file_system_type debug_fs_type = {
 	.kill_sb =	kill_litter_super,
 };
 
-static int debugfs_create_by_name(const char *name, mode_t mode,
+static int debugfs_create_by_name(const char *name, umode_t mode,
 				  struct dentry *parent,
 				  struct dentry **dentry,
 				  void *data,
@@ -214,7 +214,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
  * If debugfs is not enabled in the kernel, the value -%ENODEV will be
  * returned.
  */
-struct dentry *debugfs_create_file(const char *name, mode_t mode,
+struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops)
 {
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index dc45deb19e68..73ba81928bce 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -553,7 +553,7 @@ void o2net_debugfs_exit(void)
 
 int o2net_debugfs_init(void)
 {
-	mode_t mode = S_IFREG|S_IRUSR;
+	umode_t mode = S_IFREG|S_IRUSR;
 
 	o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
 	if (o2net_dentry)
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index e7d9b20ddc5b..d1ac841e8dc7 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -34,7 +34,7 @@ extern struct dentry *arch_debugfs_dir;
 extern const struct file_operations debugfs_file_operations;
 extern const struct inode_operations debugfs_link_operations;
 
-struct dentry *debugfs_create_file(const char *name, mode_t mode,
+struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops);
 
@@ -49,28 +49,28 @@ void debugfs_remove_recursive(struct dentry *dentry);
 struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
                 struct dentry *new_dir, const char *new_name);
 
-struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+struct dentry *debugfs_create_u8(const char *name, umode_t mode,
 				 struct dentry *parent, u8 *value);
-struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+struct dentry *debugfs_create_u16(const char *name, umode_t mode,
 				  struct dentry *parent, u16 *value);
-struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+struct dentry *debugfs_create_u32(const char *name, umode_t mode,
 				  struct dentry *parent, u32 *value);
-struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+struct dentry *debugfs_create_u64(const char *name, umode_t mode,
 				  struct dentry *parent, u64 *value);
-struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+struct dentry *debugfs_create_x8(const char *name, umode_t mode,
 				 struct dentry *parent, u8 *value);
-struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+struct dentry *debugfs_create_x16(const char *name, umode_t mode,
 				  struct dentry *parent, u16 *value);
-struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+struct dentry *debugfs_create_x32(const char *name, umode_t mode,
 				  struct dentry *parent, u32 *value);
-struct dentry *debugfs_create_x64(const char *name, mode_t mode,
+struct dentry *debugfs_create_x64(const char *name, umode_t mode,
 				  struct dentry *parent, u64 *value);
-struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
 				     struct dentry *parent, size_t *value);
-struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+struct dentry *debugfs_create_bool(const char *name, umode_t mode,
 				  struct dentry *parent, u32 *value);
 
-struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 				  struct dentry *parent,
 				  struct debugfs_blob_wrapper *blob);
 
@@ -86,7 +86,7 @@ bool debugfs_initialized(void);
  * want to duplicate the design decision mistakes of procfs and devfs again.
  */
 
-static inline struct dentry *debugfs_create_file(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_file(const char *name, umode_t mode,
 					struct dentry *parent, void *data,
 					const struct file_operations *fops)
 {
@@ -118,70 +118,70 @@ static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentr
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_u8(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_u8(const char *name, umode_t mode,
 					       struct dentry *parent,
 					       u8 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_u16(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_u16(const char *name, umode_t mode,
 						struct dentry *parent,
 						u16 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_u32(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_u32(const char *name, umode_t mode,
 						struct dentry *parent,
 						u32 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_u64(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_u64(const char *name, umode_t mode,
 						struct dentry *parent,
 						u64 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_x8(const char *name, umode_t mode,
 					       struct dentry *parent,
 					       u8 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_x16(const char *name, umode_t mode,
 						struct dentry *parent,
 						u16 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_x32(const char *name, umode_t mode,
 						struct dentry *parent,
 						u32 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
 				     struct dentry *parent,
 				     size_t *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_bool(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_bool(const char *name, umode_t mode,
 						 struct dentry *parent,
 						 u32 *value)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-static inline struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+static inline struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 				  struct dentry *parent,
 				  struct debugfs_blob_wrapper *blob)
 {
diff --git a/include/linux/relay.h b/include/linux/relay.h
index 14a86bc7102b..a822fd71fd64 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -144,7 +144,7 @@ struct rchan_callbacks
 	 */
 	struct dentry *(*create_buf_file)(const char *filename,
 					  struct dentry *parent,
-					  int mode,
+					  umode_t mode,
 					  struct rchan_buf *buf,
 					  int *is_global);
 
diff --git a/kernel/relay.c b/kernel/relay.c
index 226fade4d727..4335e1d7ee2d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -302,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
  */
 static struct dentry *create_buf_file_default_callback(const char *filename,
 						       struct dentry *parent,
-						       int mode,
+						       umode_t mode,
 						       struct rchan_buf *buf,
 						       int *is_global)
 {
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 16fc34a0806f..cdea7b56b0c9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -402,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry)
 
 static struct dentry *blk_create_buf_file_callback(const char *filename,
 						   struct dentry *parent,
-						   int mode,
+						   umode_t mode,
 						   struct rchan_buf *buf,
 						   int *is_global)
 {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f2bd275bb60f..660b069a0f99 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4385,7 +4385,7 @@ static const struct file_operations trace_options_core_fops = {
 };
 
 struct dentry *trace_create_file(const char *name,
-				 mode_t mode,
+				 umode_t mode,
 				 struct dentry *parent,
 				 void *data,
 				 const struct file_operations *fops)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 092e1f8d18dc..0154c0b850de 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -312,7 +312,7 @@ void tracing_reset_current(int cpu);
 void tracing_reset_current_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 struct dentry *trace_create_file(const char *name,
-				 mode_t mode,
+				 umode_t mode,
 				 struct dentry *parent,
 				 void *data,
 				 const struct file_operations *fops);
diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index 4f7554025e30..b4801f51b607 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -149,7 +149,7 @@ static int debugfs_ul_get(void *data, u64 *val)
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_ul, debugfs_ul_get, debugfs_ul_set, "%llu\n");
 
-static struct dentry *debugfs_create_ul(const char *name, mode_t mode,
+static struct dentry *debugfs_create_ul(const char *name, umode_t mode,
 				struct dentry *parent, unsigned long *value)
 {
 	return debugfs_create_file(name, mode, parent, value, &fops_ul);
@@ -169,7 +169,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_stacktrace_depth, debugfs_ul_get,
 			debugfs_stacktrace_depth_set, "%llu\n");
 
 static struct dentry *debugfs_create_stacktrace_depth(
-	const char *name, mode_t mode,
+	const char *name, umode_t mode,
 	struct dentry *parent, unsigned long *value)
 {
 	return debugfs_create_file(name, mode, parent, value,
@@ -193,7 +193,7 @@ static int debugfs_atomic_t_get(void *data, u64 *val)
 DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
 			debugfs_atomic_t_set, "%lld\n");
 
-static struct dentry *debugfs_create_atomic_t(const char *name, mode_t mode,
+static struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
 				struct dentry *parent, atomic_t *value)
 {
 	return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
@@ -202,7 +202,7 @@ static struct dentry *debugfs_create_atomic_t(const char *name, mode_t mode,
 struct dentry *fault_create_debugfs_attr(const char *name,
 			struct dentry *parent, struct fault_attr *attr)
 {
-	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
 	struct dentry *dir;
 
 	dir = debugfs_create_dir(name, parent);
diff --git a/mm/failslab.c b/mm/failslab.c
index 0dd7b8fec71c..fefaabaab76d 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -35,7 +35,7 @@ __setup("failslab=", setup_failslab);
 static int __init failslab_debugfs_init(void)
 {
 	struct dentry *dir;
-	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
 
 	dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
 	if (IS_ERR(dir))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2b8ba3aebf6e..99930ec7d140 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1408,7 +1408,7 @@ static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 
 static int __init fail_page_alloc_debugfs(void)
 {
-	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
 	struct dentry *dir;
 
 	dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
-- 
cgit v1.2.3-55-g7522


From 28c3c05d337f6fdf84faf69374e6325b80cbf9ad Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Fri, 30 Dec 2011 14:37:05 -0500
Subject: PCI: add set_nouse_crs for use by a pci=nocrs blacklist

Some machines don't boot unless passed pci=nocrs.
(See https://bugzilla.redhat.com/show_bug.cgi?id=770308 for details of
  one report. Waiting on dmidecode output for others).

Currently there is a DMI whitelist, even though the default is on.

v2: drop the 1536 blacklist entry, superceded by the PNP/MMCONFIG changes from
    Bjorn

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 404f21a3ff9e..f5ccf29cd6aa 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -24,6 +24,12 @@ static int __init set_use_crs(const struct dmi_system_id *id)
 	return 0;
 }
 
+static int __init set_nouse_crs(const struct dmi_system_id *id)
+{
+	pci_use_crs = false;
+	return 0;
+}
+
 static const struct dmi_system_id pci_use_crs_table[] __initconst = {
 	/* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */
 	{
@@ -54,6 +60,7 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
 			DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
 		},
 	},
+
 	{}
 };
 
-- 
cgit v1.2.3-55-g7522


From e702781fa846dd726b73e673f91ffbd3b0e8d114 Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Wed, 4 Jan 2012 11:33:12 -0500
Subject: PCI: Add Dell Studio 1557 to pci=nocrs blacklist

The Dell Studio 1557 also doesn't suspend correctly when CRS is enabled.
Details at https://bugzilla.redhat.com/show_bug.cgi?id=769657

Reported-by: Gregory S. Hoerner <ghoerner@transcendingthought.com>
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index f5ccf29cd6aa..0d9329f203e9 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -61,6 +61,18 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
 		},
 	},
 
+	/* Now for the blacklist.. */
+
+	/* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */
+	{
+		.callback = set_nouse_crs,
+		.ident = "Dell Studio 1557",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1557"),
+			DMI_MATCH(DMI_BIOS_VERSION, "A09"),
+		},
+	},
 	{}
 };
 
-- 
cgit v1.2.3-55-g7522


From 8b6a5af92c03b363df050da906480085b6cd6e00 Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Wed, 4 Jan 2012 11:30:52 -0500
Subject: PCI: Add Thinkpad SL510 to pci=nocrs blacklist

Enabling CRS by default breaks suspend on the Thinkpad SL510.
Details in https://bugzilla.redhat.com/show_bug.cgi?id=769657

Reported-by: Stefan Kirrmann <stefan.kirrmann@gmail.com>
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 0d9329f203e9..e662ceebd798 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -73,6 +73,16 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
 			DMI_MATCH(DMI_BIOS_VERSION, "A09"),
 		},
 	},
+	/* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */
+	{
+		.callback = set_nouse_crs,
+		.ident = "Thinkpad SL510",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BOARD_NAME, "2847DFG"),
+			DMI_MATCH(DMI_BIOS_VERSION, "6JET85WW (1.43 )"),
+		},
+	},
 	{}
 };
 
-- 
cgit v1.2.3-55-g7522


From ae5cd86455381282ece162966183d3f208c6fad7 Mon Sep 17 00:00:00 2001
From: Gary Hade
Date: Mon, 14 Nov 2011 15:42:16 -0800
Subject: x86/PCI: Ignore CPU non-addressable _CRS reserved memory resources

This assures that a _CRS reserved host bridge window or window region is
not used if it is not addressable by the CPU.  The new code either trims
the window to exclude the non-addressable portion or totally ignores the
window if the entire window is non-addressable.

The current code has been shown to be problematic with 32-bit non-PAE
kernels on systems where _CRS reserves resources above 4GB.

Signed-off-by: Gary Hade <garyhade@us.ibm.com>
Reviewed-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Thomas Renninger <trenn@novell.com>
Cc: linux-kernel@vger.kernel.org
Cc: stable@kernel.org
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index e662ceebd798..425500bb24e6 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -178,7 +178,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
 	struct acpi_resource_address64 addr;
 	acpi_status status;
 	unsigned long flags;
-	u64 start, end;
+	u64 start, orig_end, end;
 
 	status = resource_to_addr(acpi_res, &addr);
 	if (!ACPI_SUCCESS(status))
@@ -194,7 +194,21 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
 		return AE_OK;
 
 	start = addr.minimum + addr.translation_offset;
-	end = addr.maximum + addr.translation_offset;
+	orig_end = end = addr.maximum + addr.translation_offset;
+
+	/* Exclude non-addressable range or non-addressable portion of range */
+	end = min(end, (u64)iomem_resource.end);
+	if (end <= start) {
+		dev_info(&info->bridge->dev,
+			"host bridge window [%#llx-%#llx] "
+			"(ignored, not CPU addressable)\n", start, orig_end);
+		return AE_OK;
+	} else if (orig_end != end) {
+		dev_info(&info->bridge->dev,
+			"host bridge window [%#llx-%#llx] "
+			"([%#llx-%#llx] ignored, not CPU addressable)\n", 
+			start, orig_end, end + 1, orig_end);
+	}
 
 	res = &info->res[info->res_num];
 	res->name = info->name;
-- 
cgit v1.2.3-55-g7522


From 96c5590058d7fded14f43af2ab521436cecf3125 Mon Sep 17 00:00:00 2001
From: Myron Stowe
Date: Fri, 28 Oct 2011 15:48:38 -0600
Subject: PCI: Pull PCI 'latency timer' setup up into the core

The 'latency timer' of PCI devices, both Type 0 and Type 1,
is setup in architecture-specific code [see: 'pcibios_set_master()'].
There are two approaches being taken by all the architectures - check
if the 'latency timer' is currently set between 16 and 255 and if not
bring it within bounds, or, do nothing (and then there is the
gratuitously different PA-RISC implementation).

There is nothing architecture-specific about PCI's 'latency timer' so
this patch pulls its setup functionality up into the PCI core by
creating a generic 'pcibios_set_master()' function using the '__weak'
attribute which can be used by all architectures as a default which,
if necessary, can then be over-ridden by architecture-specific code.

No functional change.

Signed-off-by: Myron Stowe <myron.stowe@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/blackfin/include/asm/pci.h         |  4 ----
 arch/frv/mb93090-mb00/pci-frv.c         |  6 ------
 arch/frv/mb93090-mb00/pci-frv.h         |  2 --
 arch/h8300/include/asm/pci.h            |  5 -----
 arch/mips/pci/pci.c                     |  6 ------
 arch/mn10300/unit-asb2305/pci-asb2305.c |  6 ------
 arch/mn10300/unit-asb2305/pci-asb2305.h |  2 --
 arch/sh/drivers/pci/pci.c               |  6 ------
 arch/x86/include/asm/pci_x86.h          |  2 --
 arch/x86/pci/i386.c                     |  6 ------
 drivers/pci/pci.c                       | 29 +++++++++++++++++++++++++++++
 include/linux/pci.h                     |  3 +++
 12 files changed, 32 insertions(+), 45 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/blackfin/include/asm/pci.h b/arch/blackfin/include/asm/pci.h
index 99cae2e3bac7..74352c4597d9 100644
--- a/arch/blackfin/include/asm/pci.h
+++ b/arch/blackfin/include/asm/pci.h
@@ -10,10 +10,6 @@
 #define PCIBIOS_MIN_IO 0x00001000
 #define PCIBIOS_MIN_MEM 0x10000000
 
-static inline void pcibios_set_master(struct pci_dev *dev)
-{
-	/* No special bus mastering setup handling */
-}
 static inline void pcibios_penalize_isa_irq(int irq)
 {
 	/* We don't do dynamic PCI IRQ allocation */
diff --git a/arch/frv/mb93090-mb00/pci-frv.c b/arch/frv/mb93090-mb00/pci-frv.c
index 6b4fb28e9f99..6a0cd644d7cc 100644
--- a/arch/frv/mb93090-mb00/pci-frv.c
+++ b/arch/frv/mb93090-mb00/pci-frv.c
@@ -195,12 +195,6 @@ void __init pcibios_resource_survey(void)
 	pcibios_assign_resources();
 }
 
-/*
- *  If we set up a device for bus mastering, we need to check the latency
- *  timer as certain crappy BIOSes forget to set it properly.
- */
-unsigned int pcibios_max_latency = 255;
-
 void pcibios_set_master(struct pci_dev *dev)
 {
 	u8 lat;
diff --git a/arch/frv/mb93090-mb00/pci-frv.h b/arch/frv/mb93090-mb00/pci-frv.h
index f3fe55914793..089eeba4f3bc 100644
--- a/arch/frv/mb93090-mb00/pci-frv.h
+++ b/arch/frv/mb93090-mb00/pci-frv.h
@@ -26,8 +26,6 @@ extern unsigned int __nongpreldata pci_probe;
 
 /* pci-frv.c */
 
-extern unsigned int pcibios_max_latency;
-
 void pcibios_resource_survey(void);
 
 /* pci-vdk.c */
diff --git a/arch/h8300/include/asm/pci.h b/arch/h8300/include/asm/pci.h
index cc9762091c0a..0b2acaa3dd84 100644
--- a/arch/h8300/include/asm/pci.h
+++ b/arch/h8300/include/asm/pci.h
@@ -9,11 +9,6 @@
 
 #define pcibios_assign_all_busses()	0
 
-static inline void pcibios_set_master(struct pci_dev *dev)
-{
-	/* No special bus mastering setup handling */
-}
-
 static inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
diff --git a/arch/mips/pci/pci.c b/arch/mips/pci/pci.c
index 41af7fa2887b..f93f749b92e3 100644
--- a/arch/mips/pci/pci.c
+++ b/arch/mips/pci/pci.c
@@ -205,12 +205,6 @@ static int pcibios_enable_resources(struct pci_dev *dev, int mask)
 	return 0;
 }
 
-/*
- *  If we set up a device for bus mastering, we need to check the latency
- *  timer as certain crappy BIOSes forget to set it properly.
- */
-static unsigned int pcibios_max_latency = 255;
-
 void pcibios_set_master(struct pci_dev *dev)
 {
 	u8 lat;
diff --git a/arch/mn10300/unit-asb2305/pci-asb2305.c b/arch/mn10300/unit-asb2305/pci-asb2305.c
index 8e6763e6f250..2b299c413ae5 100644
--- a/arch/mn10300/unit-asb2305/pci-asb2305.c
+++ b/arch/mn10300/unit-asb2305/pci-asb2305.c
@@ -213,12 +213,6 @@ void __init pcibios_resource_survey(void)
 	pcibios_allocate_resources(1);
 }
 
-/*
- *  If we set up a device for bus mastering, we need to check the latency
- *  timer as certain crappy BIOSes forget to set it properly.
- */
-unsigned int pcibios_max_latency = 255;
-
 void pcibios_set_master(struct pci_dev *dev)
 {
 	u8 lat;
diff --git a/arch/mn10300/unit-asb2305/pci-asb2305.h b/arch/mn10300/unit-asb2305/pci-asb2305.h
index c3fa294b6e28..1194fe486b01 100644
--- a/arch/mn10300/unit-asb2305/pci-asb2305.h
+++ b/arch/mn10300/unit-asb2305/pci-asb2305.h
@@ -31,8 +31,6 @@ extern unsigned int pci_probe;
 
 /* pci-asb2305.c */
 
-extern unsigned int pcibios_max_latency;
-
 extern void pcibios_resource_survey(void);
 
 /* pci.c */
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index c2691afe8f79..cfdb2f652949 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -243,12 +243,6 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
 	return pci_enable_resources(dev, mask);
 }
 
-/*
- *  If we set up a device for bus mastering, we need to check and set
- *  the latency timer as it may not be properly set.
- */
-static unsigned int pcibios_max_latency = 255;
-
 void pcibios_set_master(struct pci_dev *dev)
 {
 	u8 lat;
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index e38197806853..b3a531746026 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -44,8 +44,6 @@ enum pci_bf_sort_state {
 
 /* pci-i386.c */
 
-extern unsigned int pcibios_max_latency;
-
 void pcibios_resource_survey(void);
 void pcibios_set_cache_line_size(void);
 
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 794b092d01ae..dd5806b0fc8b 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -254,12 +254,6 @@ void __init pcibios_resource_survey(void)
  */
 fs_initcall(pcibios_assign_resources);
 
-/*
- *  If we set up a device for bus mastering, we need to check the latency
- *  timer as certain crappy BIOSes forget to set it properly.
- */
-unsigned int pcibios_max_latency = 255;
-
 void pcibios_set_master(struct pci_dev *dev)
 {
 	u8 lat;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 924193ef4fe1..f9abe84cf5e0 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -88,6 +88,12 @@ enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_TUNE_OFF;
 u8 pci_dfl_cache_line_size __devinitdata = L1_CACHE_BYTES >> 2;
 u8 pci_cache_line_size;
 
+/*
+ * If we set up a device for bus mastering, we need to check the latency
+ * timer as certain BIOSes forget to set it properly.
+ */
+unsigned int pcibios_max_latency = 255;
+
 /**
  * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children
  * @bus: pointer to PCI bus structure to search
@@ -2595,6 +2601,29 @@ static void __pci_set_master(struct pci_dev *dev, bool enable)
 	dev->is_busmaster = enable;
 }
 
+/**
+ * pcibios_set_master - enable PCI bus-mastering for device dev
+ * @dev: the PCI device to enable
+ *
+ * Enables PCI bus-mastering for the device.  This is the default
+ * implementation.  Architecture specific implementations can override
+ * this if necessary.
+ */
+void __weak pcibios_set_master(struct pci_dev *dev)
+{
+	u8 lat;
+
+	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
+	if (lat < 16)
+		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
+	else if (lat > pcibios_max_latency)
+		lat = pcibios_max_latency;
+	else
+		return;
+	dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat);
+	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
+}
+
 /**
  * pci_set_master - enables bus-mastering for device dev
  * @dev: the PCI device to enable
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 569341d2d527..4c16a5788998 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -795,8 +795,11 @@ static inline int pci_is_managed(struct pci_dev *pdev)
 }
 
 void pci_disable_device(struct pci_dev *dev);
+
+extern unsigned int pcibios_max_latency;
 void pci_set_master(struct pci_dev *dev);
 void pci_clear_master(struct pci_dev *dev);
+
 int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state);
 int pci_set_cacheline_size(struct pci_dev *dev);
 #define HAVE_PCI_SET_MWI
-- 
cgit v1.2.3-55-g7522


From b9a276ad262815d88f4dd232d578864949aab3b9 Mon Sep 17 00:00:00 2001
From: Myron Stowe
Date: Fri, 28 Oct 2011 15:49:13 -0600
Subject: PCI: x86: use generic pcibios_set_master()

This patch removes x86's architecture-specific 'pcibios_set_master()'
routine and lets the default PCI core based implementation handle PCI
device 'latency timer' setup.

No functional change.

Signed-off-by: Myron Stowe <myron.stowe@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/i386.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index dd5806b0fc8b..91821a1a0c3a 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -254,20 +254,6 @@ void __init pcibios_resource_survey(void)
  */
 fs_initcall(pcibios_assign_resources);
 
-void pcibios_set_master(struct pci_dev *dev)
-{
-	u8 lat;
-	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
-	if (lat < 16)
-		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
-	else if (lat > pcibios_max_latency)
-		lat = pcibios_max_latency;
-	else
-		return;
-	dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat);
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
-}
-
 static const struct vm_operations_struct pci_mmap_ops = {
 	.access = generic_access_phys,
 };
-- 
cgit v1.2.3-55-g7522


From ca3671a83389eea1458929d22c66a69e955bfb07 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann
Date: Mon, 5 Dec 2011 18:12:28 +0100
Subject: x86/PCI: amd: Kill misleading message about enablement of IO access
 to PCI ECS]

Commit 24d9b70b8c679264756a6980e668b96b3f964826 (x86: Use PCI method
for enabling AMD extended config space before MSR method) added a
message when IO access to PCI ECS was enabled via access to the NB_CFG
PCI register.  This can lead to a bogus message like

[    0.365177] Extended Config Space enabled on 0 nodes

which is misleading because IO ECS access is subsequently enabled for
AMD CPUs (that support this) by modifying the corresponding NB_CFG
MSR.

Furthermore it's not "Extended Config Space" that is enabled by this
register setting. It's the IO access that is enabled for extended
configruation space.

IMHO the ambiguous message needs to be cancelled.

Cc: Jan Beulich <jbeulich@novell.com>
Cc: Robert Richter <robert.richter@amd.com>
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/amd_bus.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 026e4931d162..7b7a89712d50 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -403,7 +403,6 @@ static void __init pci_enable_pci_io_ecs(void)
 			++n;
 		}
 	}
-	pr_info("Extended Config Space enabled on %u nodes\n", n);
 #endif
 }
 
-- 
cgit v1.2.3-55-g7522


From 6361d72b04d1f77736142bc3911a32b814370729 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas
Date: Fri, 28 Oct 2011 16:28:03 -0600
Subject: x86/PCI: read Broadcom CNB20LE host bridge info before PCI scan

We currently read the CNB20LE aperture information in a PCI quirk,
which happens after we've already created the root bus.  This patch
changes it to read the apertures earlier so we can create the root
bus with the correct resources.

I believe the CNB20LE lives at "pci 0000:00:00" based on
https://lkml.org/lkml/2010/8/13/220

CC: Ira W. Snyder <iws@ovro.caltech.edu>
CC: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/broadcom_bus.c | 62 ++++++++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index ab8269b0da29..f3a7c569a403 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -15,10 +15,11 @@
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <asm/pci_x86.h>
+#include <asm/pci-direct.h>
 
 #include "bus_numa.h"
 
-static void __devinit cnb20le_res(struct pci_dev *dev)
+static void __init cnb20le_res(u8 bus, u8 slot, u8 func)
 {
 	struct pci_root_info *info;
 	struct resource res;
@@ -26,21 +27,12 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
 	u8 fbus, lbus;
 	int i;
 
-#ifdef CONFIG_ACPI
-	/*
-	 * We should get host bridge information from ACPI unless the BIOS
-	 * doesn't support it.
-	 */
-	if (acpi_os_get_root_pointer())
-		return;
-#endif
-
 	info = &pci_root_info[pci_root_num];
 	pci_root_num++;
 
 	/* read the PCI bus numbers */
-	pci_read_config_byte(dev, 0x44, &fbus);
-	pci_read_config_byte(dev, 0x45, &lbus);
+	fbus = read_pci_config_byte(bus, slot, func, 0x44);
+	lbus = read_pci_config_byte(bus, slot, func, 0x45);
 	info->bus_min = fbus;
 	info->bus_max = lbus;
 
@@ -59,8 +51,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
 	}
 
 	/* read the non-prefetchable memory window */
-	pci_read_config_word(dev, 0xc0, &word1);
-	pci_read_config_word(dev, 0xc2, &word2);
+	word1 = read_pci_config_16(bus, slot, func, 0xc0);
+	word2 = read_pci_config_16(bus, slot, func, 0xc2);
 	if (word1 != word2) {
 		res.start = (word1 << 16) | 0x0000;
 		res.end   = (word2 << 16) | 0xffff;
@@ -69,8 +61,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
 	}
 
 	/* read the prefetchable memory window */
-	pci_read_config_word(dev, 0xc4, &word1);
-	pci_read_config_word(dev, 0xc6, &word2);
+	word1 = read_pci_config_16(bus, slot, func, 0xc4);
+	word2 = read_pci_config_16(bus, slot, func, 0xc6);
 	if (word1 != word2) {
 		res.start = (word1 << 16) | 0x0000;
 		res.end   = (word2 << 16) | 0xffff;
@@ -79,8 +71,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
 	}
 
 	/* read the IO port window */
-	pci_read_config_word(dev, 0xd0, &word1);
-	pci_read_config_word(dev, 0xd2, &word2);
+	word1 = read_pci_config_16(bus, slot, func, 0xd0);
+	word2 = read_pci_config_16(bus, slot, func, 0xd2);
 	if (word1 != word2) {
 		res.start = word1;
 		res.end   = word2;
@@ -92,13 +84,37 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
 	res.start = fbus;
 	res.end   = lbus;
 	res.flags = IORESOURCE_BUS;
-	dev_info(&dev->dev, "CNB20LE PCI Host Bridge (domain %04x %pR)\n",
-			    pci_domain_nr(dev->bus), &res);
+	printk(KERN_INFO "CNB20LE PCI Host Bridge (domain 0000 %pR)\n", &res);
 
 	for (i = 0; i < info->res_num; i++)
-		dev_info(&dev->dev, "host bridge window %pR\n", &info->res[i]);
+		printk(KERN_INFO "host bridge window %pR\n", &info->res[i]);
 }
 
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE,
-			cnb20le_res);
+static int __init broadcom_postcore_init(void)
+{
+	u8 bus = 0, slot = 0;
+	u32 id;
+	u16 vendor, device;
+
+#ifdef CONFIG_ACPI
+	/*
+	 * We should get host bridge information from ACPI unless the BIOS
+	 * doesn't support it.
+	 */
+	if (acpi_os_get_root_pointer())
+		return 0;
+#endif
+
+	id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID);
+	vendor = id & 0xffff;
+	device = (id >> 16) & 0xffff;
+
+	if (vendor == PCI_VENDOR_ID_SERVERWORKS &&
+	    device == PCI_DEVICE_ID_SERVERWORKS_LE) {
+		cnb20le_res(bus, slot, 0);
+		cnb20le_res(bus, slot, 1);
+	}
+	return 0;
+}
 
+postcore_initcall(broadcom_postcore_init);
-- 
cgit v1.2.3-55-g7522


From 46fbade05ca0784ca3c959bd7bf2aae7d81306c2 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas
Date: Fri, 28 Oct 2011 16:28:08 -0600
Subject: x86/PCI: use pci_scan_bus() instead of pci_scan_bus_parented()

This doesn't change any functionality, but it makes a subsequent patch
slightly simpler.

pci_scan_bus(NULL, ...) and pci_scan_bus_parented() are identical except
that pci_scan_bus() also calls pci_bus_add_devices():

  pci_scan_bus_parented
    pci_create_bus
    pci_scan_child_bus

  pci_scan_bus
    pci_create_bus
    pci_scan_child_bus
    pci_bus_add_devices

All callers of pcibios_scan_root() call pci_bus_add_devices() explicitly,
and we don't pass a parent device, so we might as well use pci_scan_bus().

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/common.c   | 2 +-
 arch/x86/pci/legacy.c   | 3 ---
 arch/x86/pci/numaq_32.c | 2 --
 3 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 7962ccb4d9b2..07c55ce6fdf5 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -456,7 +456,7 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
 	sd->node = get_mp_bus_to_node(busnum);
 
 	printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
-	bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
+	bus = pci_scan_bus(busnum, &pci_root_ops, sd);
 	if (!bus)
 		kfree(sd);
 
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 2c2aeabc2609..a1df191129d3 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -31,9 +31,6 @@ int __init pci_legacy_init(void)
 
 	printk("PCI: Probing PCI hardware\n");
 	pci_root_bus = pcibios_scan_root(0);
-	if (pci_root_bus)
-		pci_bus_add_devices(pci_root_bus);
-
 	return 0;
 }
 
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 51abf02f9226..83e125b95ca6 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -153,8 +153,6 @@ int __init pci_numaq_init(void)
 	raw_pci_ops = &pci_direct_conf1_mq;
 
 	pci_root_bus = pcibios_scan_root(0);
-	if (pci_root_bus)
-		pci_bus_add_devices(pci_root_bus);
 	if (num_online_nodes() > 1)
 		for_each_online_node(quad) {
 			if (quad == 0)
-- 
cgit v1.2.3-55-g7522


From 2cd6975a4ff92a75e46240109d01c1daf4682e5d Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas
Date: Fri, 28 Oct 2011 16:28:14 -0600
Subject: x86/PCI: convert to pci_create_root_bus() and pci_scan_root_bus()

x86 has two kinds of PCI root bus scanning:

(1) ACPI-based, using _CRS resources.  This used pci_create_bus(), not
    pci_scan_bus(), because ACPI hotplug needed to split the
    pci_bus_add_devices() into a separate host bridge .start() method.

    This patch parses the _CRS resources earlier, so we can build a list of
    resources and pass it to pci_create_root_bus().

    Note that as before, we parse the _CRS even if we aren't going to use
    it so we can print it for debugging purposes.

(2) All other, which used either default resources (ioport_resource and
    iomem_resource) or information read from the hardware via amd_bus.c or
    similar.  This used pci_scan_bus().

    This patch converts x86_pci_root_bus_res_quirks() (previously called
    from pcibios_fixup_bus()) to x86_pci_root_bus_resources(), which builds
    a list of resources before we call pci_scan_root_bus().

    We also use x86_pci_root_bus_resources() if we have ACPI but are
    ignoring _CRS.

CC: Yinghai Lu <yinghai.lu@oracle.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/include/asm/topology.h |  2 +-
 arch/x86/pci/acpi.c             | 28 ++++++++++++++--------------
 arch/x86/pci/bus_numa.c         | 31 ++++++++++++++++++-------------
 arch/x86/pci/common.c           | 19 ++++++++++++-------
 4 files changed, 45 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index c00692476e9f..5f83b136dda7 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -174,7 +174,7 @@ static inline void arch_fix_phys_package_id(int num, u32 slot)
 }
 
 struct pci_bus;
-void x86_pci_root_bus_res_quirks(struct pci_bus *b);
+void x86_pci_root_bus_resources(int bus, struct list_head *resources);
 
 #ifdef CONFIG_SMP
 #define mc_capable()	((boot_cpu_data.x86_max_cores > 1) && \
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 425500bb24e6..a312e76063a7 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -12,7 +12,7 @@ struct pci_root_info {
 	char *name;
 	unsigned int res_num;
 	struct resource *res;
-	struct pci_bus *bus;
+	struct list_head *resources;
 	int busnum;
 };
 
@@ -304,23 +304,20 @@ static void add_resources(struct pci_root_info *info)
 				 "ignoring host bridge window %pR (conflicts with %s %pR)\n",
 				 res, conflict->name, conflict);
 		else
-			pci_bus_add_resource(info->bus, res, 0);
+			pci_add_resource(info->resources, res);
 	}
 }
 
 static void
 get_current_resources(struct acpi_device *device, int busnum,
-			int domain, struct pci_bus *bus)
+		      int domain, struct list_head *resources)
 {
 	struct pci_root_info info;
 	size_t size;
 
-	if (pci_use_crs)
-		pci_bus_remove_resources(bus);
-
 	info.bridge = device;
-	info.bus = bus;
 	info.res_num = 0;
+	info.resources = resources;
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
 				&info);
 	if (!info.res_num)
@@ -329,7 +326,7 @@ get_current_resources(struct acpi_device *device, int busnum,
 	size = sizeof(*info.res) * info.res_num;
 	info.res = kmalloc(size, GFP_KERNEL);
 	if (!info.res)
-		goto res_alloc_fail;
+		return;
 
 	info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
 	if (!info.name)
@@ -344,8 +341,6 @@ get_current_resources(struct acpi_device *device, int busnum,
 
 name_alloc_fail:
 	kfree(info.res);
-res_alloc_fail:
-	return;
 }
 
 struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
@@ -353,6 +348,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 	struct acpi_device *device = root->device;
 	int domain = root->segment;
 	int busnum = root->secondary.start;
+	LIST_HEAD(resources);
 	struct pci_bus *bus;
 	struct pci_sysdata *sd;
 	int node;
@@ -407,11 +403,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 		memcpy(bus->sysdata, sd, sizeof(*sd));
 		kfree(sd);
 	} else {
-		bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd);
-		if (bus) {
-			get_current_resources(device, busnum, domain, bus);
+		get_current_resources(device, busnum, domain, &resources);
+		if (list_empty(&resources))
+			x86_pci_root_bus_resources(busnum, &resources);
+		bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd,
+					  &resources);
+		if (bus)
 			bus->subordinate = pci_scan_child_bus(bus);
-		}
+		else
+			pci_free_resource_list(&resources);
 	}
 
 	/* After the PCI-E bus has been walked and all devices discovered,
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 64a122883896..fd3f65510e9d 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -7,45 +7,50 @@
 int pci_root_num;
 struct pci_root_info pci_root_info[PCI_ROOT_NR];
 
-void x86_pci_root_bus_res_quirks(struct pci_bus *b)
+void x86_pci_root_bus_resources(int bus, struct list_head *resources)
 {
 	int i;
 	int j;
 	struct pci_root_info *info;
 
-	/* don't go for it if _CRS is used already */
-	if (b->resource[0] != &ioport_resource ||
-	    b->resource[1] != &iomem_resource)
-		return;
-
 	if (!pci_root_num)
-		return;
+		goto default_resources;
 
 	for (i = 0; i < pci_root_num; i++) {
-		if (pci_root_info[i].bus_min == b->number)
+		if (pci_root_info[i].bus_min == bus)
 			break;
 	}
 
 	if (i == pci_root_num)
-		return;
+		goto default_resources;
 
-	printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
-			b->number);
+	printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n",
+	       bus);
 
-	pci_bus_remove_resources(b);
 	info = &pci_root_info[i];
 	for (j = 0; j < info->res_num; j++) {
 		struct resource *res;
 		struct resource *root;
 
 		res = &info->res[j];
-		pci_bus_add_resource(b, res, 0);
+		pci_add_resource(resources, res);
 		if (res->flags & IORESOURCE_IO)
 			root = &ioport_resource;
 		else
 			root = &iomem_resource;
 		insert_resource(root, res);
 	}
+	return;
+
+default_resources:
+	/*
+	 * We don't have any host bridge aperture information from the
+	 * "native host bridge drivers," e.g., amd_bus or broadcom_bus,
+	 * so fall back to the defaults historically used by pci_create_bus().
+	 */
+	printk(KERN_DEBUG "PCI: root bus %02x: using default resources\n", bus);
+	pci_add_resource(resources, &ioport_resource);
+	pci_add_resource(resources, &iomem_resource);
 }
 
 void __devinit update_res(struct pci_root_info *info, resource_size_t start,
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 07c55ce6fdf5..323481e06ef8 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -164,9 +164,6 @@ void __devinit pcibios_fixup_bus(struct pci_bus *b)
 {
 	struct pci_dev *dev;
 
-	/* root bus? */
-	if (!b->parent)
-		x86_pci_root_bus_res_quirks(b);
 	pci_read_bridge_bases(b);
 	list_for_each_entry(dev, &b->devices, bus_list)
 		pcibios_fixup_device_resources(dev);
@@ -433,6 +430,7 @@ void __init dmi_check_pciprobe(void)
 
 struct pci_bus * __devinit pcibios_scan_root(int busnum)
 {
+	LIST_HEAD(resources);
 	struct pci_bus *bus = NULL;
 	struct pci_sysdata *sd;
 
@@ -456,9 +454,12 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
 	sd->node = get_mp_bus_to_node(busnum);
 
 	printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
-	bus = pci_scan_bus(busnum, &pci_root_ops, sd);
-	if (!bus)
+	x86_pci_root_bus_resources(busnum, &resources);
+	bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources);
+	if (!bus) {
+		pci_free_resource_list(&resources);
 		kfree(sd);
+	}
 
 	return bus;
 }
@@ -639,6 +640,7 @@ int pci_ext_cfg_avail(struct pci_dev *dev)
 
 struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
 {
+	LIST_HEAD(resources);
 	struct pci_bus *bus = NULL;
 	struct pci_sysdata *sd;
 
@@ -653,9 +655,12 @@ struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops,
 		return NULL;
 	}
 	sd->node = node;
-	bus = pci_scan_bus(busno, ops, sd);
-	if (!bus)
+	x86_pci_root_bus_resources(busno, &resources);
+	bus = pci_scan_root_bus(NULL, busno, ops, sd, &resources);
+	if (!bus) {
+		pci_free_resource_list(&resources);
 		kfree(sd);
+	}
 
 	return bus;
 }
-- 
cgit v1.2.3-55-g7522


From 24d25dbfa63c376323096660bfa9ad45a08870ce Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas
Date: Thu, 5 Jan 2012 14:27:19 -0700
Subject: x86/PCI: amd: factor out MMCONFIG discovery

This factors out the AMD native MMCONFIG discovery so we can use it
outside amd_bus.c.

amd_bus.c reads AMD MSRs so it can remove the MMCONFIG area from the
PCI resources.  We may also need the MMCONFIG information to work
around BIOS defects in the ACPI MCFG table.

Cc: Borislav Petkov <borislav.petkov@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: stable@kernel.org       # 2.6.34+
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/include/asm/amd_nb.h |  2 ++
 arch/x86/kernel/amd_nb.c      | 31 +++++++++++++++++++++++++++++++
 arch/x86/pci/amd_bus.c        | 42 +++++++++++-------------------------------
 3 files changed, 44 insertions(+), 31 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 8e41071704a5..49ad773f4b9f 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_AMD_NB_H
 #define _ASM_X86_AMD_NB_H
 
+#include <linux/ioport.h>
 #include <linux/pci.h>
 
 struct amd_nb_bus_dev_range {
@@ -13,6 +14,7 @@ extern const struct pci_device_id amd_nb_misc_ids[];
 extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
 
 extern bool early_is_amd_nb(u32 value);
+extern struct resource *amd_get_mmconfig_range(struct resource *res);
 extern int amd_cache_northbridges(void);
 extern void amd_flush_garts(void);
 extern int amd_numa_init(void);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4c39baa8facc..bae1efe6d515 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -119,6 +119,37 @@ bool __init early_is_amd_nb(u32 device)
 	return false;
 }
 
+struct resource *amd_get_mmconfig_range(struct resource *res)
+{
+	u32 address;
+	u64 base, msr;
+	unsigned segn_busn_bits;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+		return NULL;
+
+	/* assume all cpus from fam10h have mmconfig */
+        if (boot_cpu_data.x86 < 0x10)
+		return NULL;
+
+	address = MSR_FAM10H_MMIO_CONF_BASE;
+	rdmsrl(address, msr);
+
+	/* mmconfig is not enabled */
+	if (!(msr & FAM10H_MMIO_CONF_ENABLE))
+		return NULL;
+
+	base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
+
+	segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
+			 FAM10H_MMIO_CONF_BUSRANGE_MASK;
+
+	res->flags = IORESOURCE_MEM;
+	res->start = base;
+	res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
+	return res;
+}
+
 int amd_get_subcaches(int cpu)
 {
 	struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 7b7a89712d50..0567df3890e1 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -30,34 +30,6 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = {
 	{ 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 },
 };
 
-static u64 __initdata fam10h_mmconf_start;
-static u64 __initdata fam10h_mmconf_end;
-static void __init get_pci_mmcfg_amd_fam10h_range(void)
-{
-	u32 address;
-	u64 base, msr;
-	unsigned segn_busn_bits;
-
-	/* assume all cpus from fam10h have mmconf */
-        if (boot_cpu_data.x86 < 0x10)
-		return;
-
-	address = MSR_FAM10H_MMIO_CONF_BASE;
-	rdmsrl(address, msr);
-
-	/* mmconfig is not enable */
-	if (!(msr & FAM10H_MMIO_CONF_ENABLE))
-		return;
-
-	base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
-
-	segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
-			 FAM10H_MMIO_CONF_BUSRANGE_MASK;
-
-	fam10h_mmconf_start = base;
-	fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
-}
-
 #define RANGE_NUM 16
 
 /**
@@ -85,6 +57,9 @@ static int __init early_fill_mp_bus_info(void)
 	u64 val;
 	u32 address;
 	bool found;
+	struct resource fam10h_mmconf_res, *fam10h_mmconf;
+	u64 fam10h_mmconf_start;
+	u64 fam10h_mmconf_end;
 
 	if (!early_pci_allowed())
 		return -1;
@@ -211,12 +186,17 @@ static int __init early_fill_mp_bus_info(void)
 		subtract_range(range, RANGE_NUM, 0, end);
 
 	/* get mmconfig */
-	get_pci_mmcfg_amd_fam10h_range();
+	fam10h_mmconf = amd_get_mmconfig_range(&fam10h_mmconf_res);
 	/* need to take out mmconf range */
-	if (fam10h_mmconf_end) {
-		printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end);
+	if (fam10h_mmconf) {
+		printk(KERN_DEBUG "Fam 10h mmconf %pR\n", fam10h_mmconf);
+		fam10h_mmconf_start = fam10h_mmconf->start;
+		fam10h_mmconf_end = fam10h_mmconf->end;
 		subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
 				 fam10h_mmconf_end + 1);
+	} else {
+		fam10h_mmconf_start = 0;
+		fam10h_mmconf_end = 0;
 	}
 
 	/* mmio resource */
-- 
cgit v1.2.3-55-g7522


From 76ccc297018d25d55b789bbd508861ef1e2cdb0c Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Fri, 16 Dec 2011 17:38:18 -0500
Subject: x86/PCI: Expand the x86_msi_ops to have a restore MSIs.

The MSI restore function will become a function pointer in an
x86_msi_ops struct. It defaults to the implementation in the
io_apic.c and msi.c. We piggyback on the indirection mechanism
introduced by "x86: Introduce x86_msi_ops".

Cc: x86@kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-pci@vger.kernel.org
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/include/asm/pci.h      |  9 +++++++++
 arch/x86/include/asm/x86_init.h |  1 +
 arch/x86/kernel/x86_init.c      |  1 +
 drivers/pci/msi.c               | 29 +++++++++++++++++++++++++++--
 4 files changed, 38 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d498943b906c..df75d07571ce 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -112,19 +112,28 @@ static inline void x86_teardown_msi_irq(unsigned int irq)
 {
 	x86_msi.teardown_msi_irq(irq);
 }
+static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+	x86_msi.restore_msi_irqs(dev, irq);
+}
 #define arch_setup_msi_irqs x86_setup_msi_irqs
 #define arch_teardown_msi_irqs x86_teardown_msi_irqs
 #define arch_teardown_msi_irq x86_teardown_msi_irq
+#define arch_restore_msi_irqs x86_restore_msi_irqs
 /* implemented in arch/x86/kernel/apic/io_apic. */
 int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void native_teardown_msi_irq(unsigned int irq);
+void native_restore_msi_irqs(struct pci_dev *dev, int irq);
 /* default to the implementation in drivers/lib/msi.c */
 #define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
+#define HAVE_DEFAULT_MSI_RESTORE_IRQS
 void default_teardown_msi_irqs(struct pci_dev *dev);
+void default_restore_msi_irqs(struct pci_dev *dev, int irq);
 #else
 #define native_setup_msi_irqs		NULL
 #define native_teardown_msi_irq		NULL
 #define default_teardown_msi_irqs	NULL
+#define default_restore_msi_irqs	NULL
 #endif
 
 #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 1971e652d24b..cd5208446c2d 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -177,6 +177,7 @@ struct x86_msi_ops {
 	int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
 	void (*teardown_msi_irq)(unsigned int irq);
 	void (*teardown_msi_irqs)(struct pci_dev *dev);
+	void (*restore_msi_irqs)(struct pci_dev *dev, int irq);
 };
 
 extern struct x86_init_ops x86_init;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c1d6cd549397..83b05adaadf1 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -114,4 +114,5 @@ struct x86_msi_ops x86_msi = {
 	.setup_msi_irqs = native_setup_msi_irqs,
 	.teardown_msi_irq = native_teardown_msi_irq,
 	.teardown_msi_irqs = default_teardown_msi_irqs,
+	.restore_msi_irqs = default_restore_msi_irqs,
 };
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 82de95ec2ea0..a825d78fd0aa 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -86,6 +86,31 @@ void default_teardown_msi_irqs(struct pci_dev *dev)
 }
 #endif
 
+#ifndef arch_restore_msi_irqs
+# define arch_restore_msi_irqs default_restore_msi_irqs
+# define HAVE_DEFAULT_MSI_RESTORE_IRQS
+#endif
+
+#ifdef HAVE_DEFAULT_MSI_RESTORE_IRQS
+void default_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+	struct msi_desc *entry;
+
+	entry = NULL;
+	if (dev->msix_enabled) {
+		list_for_each_entry(entry, &dev->msi_list, list) {
+			if (irq == entry->irq)
+				break;
+		}
+	} else if (dev->msi_enabled)  {
+		entry = irq_get_msi_desc(irq);
+	}
+
+	if (entry)
+		write_msi_msg(irq, &entry->msg);
+}
+#endif
+
 static void msi_set_enable(struct pci_dev *dev, int pos, int enable)
 {
 	u16 control;
@@ -372,7 +397,7 @@ static void __pci_restore_msi_state(struct pci_dev *dev)
 
 	pci_intx_for_msi(dev, 0);
 	msi_set_enable(dev, pos, 0);
-	write_msi_msg(dev->irq, &entry->msg);
+	arch_restore_msi_irqs(dev, dev->irq);
 
 	pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
 	msi_mask_irq(entry, msi_capable_mask(control), entry->masked);
@@ -400,7 +425,7 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
 	pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control);
 
 	list_for_each_entry(entry, &dev->msi_list, list) {
-		write_msi_msg(entry->irq, &entry->msg);
+		arch_restore_msi_irqs(dev, entry->irq);
 		msix_mask_irq(entry, entry->masked);
 	}
 
-- 
cgit v1.2.3-55-g7522


From e58d429209105e698e9d0357481d62b37fe9a7dd Mon Sep 17 00:00:00 2001
From: Don Zickus
Date: Fri, 6 Jan 2012 11:17:51 -0500
Subject: x86, reboot: Fix typo in nmi reboot path

It was brought to my attention that my x86 change to use NMI in
the reboot path broke Intel Nehalem and Westmere boxes when
using kexec.

I realized I had mistyped the if statement in commit
3603a2512f9e69dc87914ba922eb4a0812b21cd6 and stuck the ')' in
the wrong spot.  Putting it in the right spot fixes kexec again.

Doh.

Reported-by: Yinghai Lu <yinghai@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Link: http://lkml.kernel.org/r/1325866671-9797-1-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 113acda5879e..66c74f481cab 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -176,7 +176,7 @@ static void native_nmi_stop_other_cpus(int wait)
 	 */
 	if (num_online_cpus() > 1) {
 		/* did someone beat us here? */
-		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id() != -1))
+		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
 			return;
 
 		if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
-- 
cgit v1.2.3-55-g7522


From 72142fd4109105c6bd21658966ca5e93c1684081 Mon Sep 17 00:00:00 2001
From: H. Peter Anvin
Date: Sat, 7 Jan 2012 14:10:18 -0800
Subject: x86: Move <asm/asm-offsets.h> from trace_syscalls.c to asm/syscall.h

This reverts commit d5e553d6e0a4bdea43adae7373e3fa144b9a1aaa, which
caused large numbers of build warnings on PowerPC.

This moves the #include <asm/asm-offsets.h> to <asm/syscall.h>, which
makes some kind of sense since NR_syscalls is syscalls related.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/20111214181545.6e13bc954cb7ddce9086e861@canb.auug.org.au
---
 arch/x86/include/asm/syscall.h | 1 +
 kernel/trace/trace_syscalls.c  | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index c4a348f7bd43..d962e5652a73 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -15,6 +15,7 @@
 
 #include <linux/sched.h>
 #include <linux/err.h>
+#include <asm/asm-offsets.h>	/* For NR_syscalls */
 
 extern const unsigned long sys_call_table[];
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5f35f6f15d99..cb654542c1a1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -6,7 +6,6 @@
 #include <linux/ftrace.h>
 #include <linux/perf_event.h>
 #include <asm/syscall.h>
-#include <asm/asm-offsets.h>
 
 #include "trace_output.h"
 #include "trace.h"
-- 
cgit v1.2.3-55-g7522


From dc6821e0cfe74802aefd2067b40fcdc03fc4599e Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Sat, 7 Jan 2012 21:27:38 -0500
Subject: xen/mmu: Fix compile errors introduced by x86/memblock mismerge.

The git commit d4bbf7e7759afc172e2bfbc5c416324590049cdd
"Merge branch 'master' into x86/memblock" mismerged the 32-bit
section causing:

arch/x86/xen/mmu.c: In function ‘xen_setup_kernel_pagetable’:
arch/x86/xen/mmu.c:1855: error: expected ‘;’ before ‘)’ token
arch/x86/xen/mmu.c:1855: error: expected statement before ‘)’ token

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index f4bf8aa574f4..58a0e46c404d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1852,7 +1852,7 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
 	xen_write_cr3(__pa(initial_page_table));
 
 	memblock_reserve(__pa(xen_start_info->pt_base),
-			 xen_start_info->nr_pt_frames * PAGE_SIZE));
+			 xen_start_info->nr_pt_frames * PAGE_SIZE);
 
 	return initial_page_table;
 }
-- 
cgit v1.2.3-55-g7522


From e39f560239984c3098237ad94c9449b1494163f8 Mon Sep 17 00:00:00 2001
From: David Daney
Date: Tue, 10 Jan 2012 15:10:21 -0800
Subject: fs: binfmt_elf: create Kconfig variable for PIE randomization

Randomization of PIE load address is hard coded in binfmt_elf.c for X86
and ARM.  Create a new Kconfig variable
(CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE) for this and use it instead.  Thus
architecture specific policy is pushed out of the generic binfmt_elf.c and
into the architecture Kconfig files.

X86 and ARM Kconfigs are modified to select the new variable so there is
no change in behavior.  A follow on patch will select it for MIPS too.

Signed-off-by: David Daney <david.daney@cavium.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/Kconfig  | 1 +
 arch/x86/Kconfig  | 1 +
 fs/Kconfig.binfmt | 3 +++
 fs/binfmt_elf.c   | 2 +-
 4 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 9d66dfc33a5a..98a6459cd398 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -16,6 +16,7 @@ config ARM
 	select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL)
 	select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL)
 	select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL)
+	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d2a69dd36d8..d6ddc0bfe36a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -62,6 +62,7 @@ config X86
 	select ANON_INODES
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
+	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_TEXT_POKE_SMP
 	select HAVE_GENERIC_HARDIRQS
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 79e2ca7973b7..e95d1b64082c 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -27,6 +27,9 @@ config COMPAT_BINFMT_ELF
 	bool
 	depends on COMPAT && BINFMT_ELF
 
+config ARCH_BINFMT_ELF_RANDOMIZE_PIE
+	bool
+
 config BINFMT_ELF_FDPIC
 	bool "Kernel support for FDPIC ELF binaries"
 	default y
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 21ac5ee4b43f..bcb884e2d613 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -794,7 +794,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			 * default mmap base, as well as whatever program they
 			 * might try to exec.  This is because the brk will
 			 * follow the loader, and is not movable.  */
-#if defined(CONFIG_X86) || defined(CONFIG_ARM)
+#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
 			/* Memory randomization might have been switched off
 			 * in runtime via sysctl.
 			 * If that is the case, retain the original non-zero
-- 
cgit v1.2.3-55-g7522


From 5e6292c0f28f03dfdb8ea3d685f0b838a23bfba4 Mon Sep 17 00:00:00 2001
From: Matt Fleming
Date: Tue, 10 Jan 2012 15:11:17 -0800
Subject: signal: add block_sigmask() for adding sigmask to current->blocked

Abstract the code sequence for adding a signal handler's sa_mask to
current->blocked because the sequence is identical for all architectures.
Furthermore, in the past some architectures actually got this code wrong,
so introduce a wrapper that all architectures can use.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/signal.c |  6 +-----
 include/linux/signal.h   |  1 +
 kernel/signal.c          | 21 +++++++++++++++++++++
 3 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 54ddaeb221c1..46a01bdc27e2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -682,7 +682,6 @@ static int
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 		struct pt_regs *regs)
 {
-	sigset_t blocked;
 	int ret;
 
 	/* Are we from a system call? */
@@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	 */
 	regs->flags &= ~X86_EFLAGS_TF;
 
-	sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
-	if (!(ka->sa.sa_flags & SA_NODEFER))
-		sigaddset(&blocked, sig);
-	set_current_blocked(&blocked);
+	block_sigmask(ka, sig);
 
 	tracehook_signal_handler(sig, info, ka, regs,
 				 test_thread_flag(TIF_SINGLESTEP));
diff --git a/include/linux/signal.h b/include/linux/signal.h
index a822300a253b..7987ce74874b 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -254,6 +254,7 @@ extern void set_current_blocked(const sigset_t *);
 extern int show_unhandled_signals;
 
 extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie);
+extern void block_sigmask(struct k_sigaction *ka, int signr);
 extern void exit_signals(struct task_struct *tsk);
 
 extern struct kmem_cache *sighand_cachep;
diff --git a/kernel/signal.c b/kernel/signal.c
index bb0efa5705ed..d532f1709fbf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2318,6 +2318,27 @@ relock:
 	return signr;
 }
 
+/**
+ * block_sigmask - add @ka's signal mask to current->blocked
+ * @ka: action for @signr
+ * @signr: signal that has been successfully delivered
+ *
+ * This function should be called when a signal has succesfully been
+ * delivered. It adds the mask of signals for @ka to current->blocked
+ * so that they are blocked during the execution of the signal
+ * handler. In addition, @signr will be blocked unless %SA_NODEFER is
+ * set in @ka->sa.sa_flags.
+ */
+void block_sigmask(struct k_sigaction *ka, int signr)
+{
+	sigset_t blocked;
+
+	sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
+	if (!(ka->sa.sa_flags & SA_NODEFER))
+		sigaddset(&blocked, signr);
+	set_current_blocked(&blocked);
+}
+
 /*
  * It could be that complete_signal() picked us to notify about the
  * group-wide signal. Other threads should be notified now to take
-- 
cgit v1.2.3-55-g7522


From b6c96c0214138186f495e3ee73737c6fc5e4efa2 Mon Sep 17 00:00:00 2001
From: Stratos Psomadakis
Date: Thu, 12 Jan 2012 15:44:47 +1030
Subject: lguest: Make sure interrupt is allocated ok by lguest_setup_irq

Make sure the interrupt is allocated correctly by lguest_setup_irq (check the
return value of irq_alloc_desc_at for -ENOMEM)

Signed-off-by: Stratos Psomadakis <psomas@cslab.ece.ntua.gr>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (cleanups and commentry)
---
 arch/x86/lguest/boot.c         | 21 +++++++++++++--------
 drivers/lguest/lguest_device.c | 10 +++++++---
 2 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index cf4603ba866f..642d8805bc1b 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -856,18 +856,23 @@ static void __init lguest_init_IRQ(void)
 }
 
 /*
- * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
- * rather than set them in lguest_init_IRQ we are called here every time an
- * lguest device needs an interrupt.
- *
- * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should
- * pass that up!
+ * Interrupt descriptors are allocated as-needed, but low-numbered ones are
+ * reserved by the generic x86 code.  So we ignore irq_alloc_desc_at if it
+ * tells us the irq is already used: other errors (ie. ENOMEM) we take
+ * seriously.
  */
-void lguest_setup_irq(unsigned int irq)
+int lguest_setup_irq(unsigned int irq)
 {
-	irq_alloc_desc_at(irq, 0);
+	int err;
+
+	/* Returns -ve error or vector number. */
+	err = irq_alloc_desc_at(irq, 0);
+	if (err < 0 && err != -EEXIST)
+		return err;
+
 	irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
 				      handle_level_irq, "level");
+	return 0;
 }
 
 /*
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index 6a1d6447b864..9e8388efd88e 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -241,7 +241,7 @@ static void lg_notify(struct virtqueue *vq)
 }
 
 /* An extern declaration inside a C file is bad form.  Don't do it. */
-extern void lguest_setup_irq(unsigned int irq);
+extern int lguest_setup_irq(unsigned int irq);
 
 /*
  * This routine finds the Nth virtqueue described in the configuration of
@@ -304,7 +304,9 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 	}
 
 	/* Make sure the interrupt is allocated. */
-	lguest_setup_irq(lvq->config.irq);
+	err = lguest_setup_irq(lvq->config.irq);
+	if (err)
+		goto destroy_vring;
 
 	/*
 	 * Tell the interrupt for this virtqueue to go to the virtio_ring
@@ -317,7 +319,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 	err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
 			  dev_name(&vdev->dev), vq);
 	if (err)
-		goto destroy_vring;
+		goto free_desc;
 
 	/*
 	 * Last of all we hook up our 'struct lguest_vq_info" to the
@@ -326,6 +328,8 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 	vq->priv = lvq;
 	return vq;
 
+free_desc:
+	irq_free_desc(lvq->config.irq);
 destroy_vring:
 	vring_del_virtqueue(vq);
 unmap:
-- 
cgit v1.2.3-55-g7522


From 5cf9a4e69c1ff0ccdd1d2b7404f95c0531355274 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas
Date: Thu, 12 Jan 2012 08:01:40 -0700
Subject: x86/PCI: build amd_bus.o only when CONFIG_AMD_NB=y

We only need amd_bus.o for AMD systems with PCI.  arch/x86/pci/Makefile
already depends on CONFIG_PCI=y, so this patch just adds the dependency
on CONFIG_AMD_NB.

Cc: Yinghai Lu <yinghai@kernel.org>
Cc: stable@kernel.org	# 2.6.34+ (needs adjustment for k8 -> amd rename)
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/pci/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 75b06f34b1f2..e76e18c94a3c 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -18,8 +18,9 @@ obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
 obj-$(CONFIG_X86_INTEL_MID)	+= mrst.o
 
 obj-y				+= common.o early.o
-obj-y				+= amd_bus.o bus_numa.o
+obj-y				+= bus_numa.o
 
+obj-$(CONFIG_AMD_NB)		+= amd_bus.o
 obj-$(CONFIG_PCI_CNB20LE_QUIRK)	+= broadcom_bus.o
 
 ifeq ($(CONFIG_PCI_DEBUG),y)
-- 
cgit v1.2.3-55-g7522


From bccd17294a26b67a8a19aaa120e3eeaa7da49281 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov
Date: Wed, 11 Jan 2012 05:11:46 +0400
Subject: x86: Get rid of 'dubious one-bit signed bitfield' sprase warning

This very noisy sparse warning appears on almost every file in the
kernel:

  CHECK   init/main.c
  arch/x86/include/asm/thread_info.h:43:55: error: dubious one-bit signed bitfield
  arch/x86/include/asm/thread_info.h:44:46: error: dubious one-bit signed bitfield

This patch changes sig_on_uaccess_error and uaccess_err flags to unsigned
type and thus fixes the warning.

Signed-off-by: Anton Vorontsov <cbouatmailru@gmail.com>
Acked-by: Andy Lutomirski <luto@mit.edu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/thread_info.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 74047159d0ab..bc817cd8b443 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -40,8 +40,8 @@ struct thread_info {
 						*/
 	__u8			supervisor_stack[0];
 #endif
-	int			sig_on_uaccess_error:1;
-	int			uaccess_err:1;	/* uaccess failed */
+	unsigned int		sig_on_uaccess_error:1;
+	unsigned int		uaccess_err:1;	/* uaccess failed */
 };
 
 #define INIT_THREAD_INFO(tsk)			\
-- 
cgit v1.2.3-55-g7522


From 476bc0015bf09dad39d36a8b19f76f0c181d1ec9 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Fri, 13 Jan 2012 09:32:18 +1030
Subject: module_param: make bool parameters really bool (arch)

module_param(bool) used to counter-intuitively take an int.  In
fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy
trick.

It's time to remove the int/unsigned int option.  For this version
it'll simply give a warning, but it'll break next kernel version.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/ia64/hp/common/aml_nfw.c  |  2 +-
 arch/x86/kernel/apm_32.c       | 16 ++++++++--------
 arch/x86/kvm/mmu.c             |  2 +-
 arch/x86/kvm/vmx.c             | 18 +++++++++---------
 arch/x86/kvm/x86.c             |  4 ++--
 arch/x86/mm/mmio-mod.c         |  4 ++--
 arch/x86/platform/geode/alix.c |  2 +-
 arch/x86/platform/iris/iris.c  |  2 +-
 8 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/hp/common/aml_nfw.c b/arch/ia64/hp/common/aml_nfw.c
index 22078486d35d..6192f7188654 100644
--- a/arch/ia64/hp/common/aml_nfw.c
+++ b/arch/ia64/hp/common/aml_nfw.c
@@ -31,7 +31,7 @@ MODULE_AUTHOR("Bjorn Helgaas <bjorn.helgaas@hp.com>");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("ACPI opregion handler for native firmware calls");
 
-static int force_register;
+static bool force_register;
 module_param_named(force, force_register, bool, 0);
 MODULE_PARM_DESC(force, "Install opregion handler even without HPQ5001 device");
 
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index a46bd383953c..f76623cbe263 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -383,21 +383,21 @@ static int ignore_sys_suspend;
 static int ignore_normal_resume;
 static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
 
-static int debug __read_mostly;
-static int smp __read_mostly;
+static bool debug __read_mostly;
+static bool smp __read_mostly;
 static int apm_disabled = -1;
 #ifdef CONFIG_SMP
-static int power_off;
+static bool power_off;
 #else
-static int power_off = 1;
+static bool power_off = 1;
 #endif
-static int realmode_power_off;
+static bool realmode_power_off;
 #ifdef CONFIG_APM_ALLOW_INTS
-static int allow_ints = 1;
+static bool allow_ints = 1;
 #else
-static int allow_ints;
+static bool allow_ints;
 #endif
-static int broken_psr;
+static bool broken_psr;
 
 static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
 static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a2a9b40db19..224b02c3cda9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -74,7 +74,7 @@ enum {
 #endif
 
 #ifdef MMU_DEBUG
-static int dbg = 0;
+static bool dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 906a7e84200f..d29216c462b3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -51,29 +51,29 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static int __read_mostly enable_vpid = 1;
+static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
-static int __read_mostly flexpriority_enabled = 1;
+static bool __read_mostly flexpriority_enabled = 1;
 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
 
-static int __read_mostly enable_ept = 1;
+static bool __read_mostly enable_ept = 1;
 module_param_named(ept, enable_ept, bool, S_IRUGO);
 
-static int __read_mostly enable_unrestricted_guest = 1;
+static bool __read_mostly enable_unrestricted_guest = 1;
 module_param_named(unrestricted_guest,
 			enable_unrestricted_guest, bool, S_IRUGO);
 
-static int __read_mostly emulate_invalid_guest_state = 0;
+static bool __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
-static int __read_mostly vmm_exclusive = 1;
+static bool __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 
-static int __read_mostly yield_on_hlt = 1;
+static bool __read_mostly yield_on_hlt = 1;
 module_param(yield_on_hlt, bool, S_IRUGO);
 
-static int __read_mostly fasteoi = 1;
+static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
 /*
@@ -81,7 +81,7 @@ module_param(fasteoi, bool, S_IRUGO);
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
  * use VMX instructions.
  */
-static int __read_mostly nested = 0;
+static bool __read_mostly nested = 0;
 module_param(nested, bool, S_IRUGO);
 
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1171def5f96b..14d6cadc4ba6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -88,8 +88,8 @@ static void process_nmi(struct kvm_vcpu *vcpu);
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
-int ignore_msrs = 0;
-module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+static bool ignore_msrs = 0;
+module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
 bool kvm_has_tsc_control;
 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index de54b9b278a7..dc0b727742f4 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -75,8 +75,8 @@ static LIST_HEAD(trace_list);		/* struct remap_trace */
 
 /* module parameters */
 static unsigned long	filter_offset;
-static int		nommiotrace;
-static int		trace_pc;
+static bool		nommiotrace;
+static bool		trace_pc;
 
 module_param(filter_offset, ulong, 0);
 module_param(nommiotrace, bool, 0);
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index ca1973699d3d..dc5f1d32aced 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -27,7 +27,7 @@
 
 #include <asm/geode.h>
 
-static int force = 0;
+static bool force = 0;
 module_param(force, bool, 0444);
 /* FIXME: Award bios is not automatically detected as Alix platform */
 MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform");
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index 1ba7f5ed8c9b..5917eb56b313 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -42,7 +42,7 @@ MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
 MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
 MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
 
-static int force;
+static bool force;
 
 module_param(force, bool, 0);
 MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation.");
-- 
cgit v1.2.3-55-g7522


From 43570fd2f47ba518145e9289f54cde3dba4c8b25 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Thu, 12 Jan 2012 17:17:27 -0800
Subject: mm,slub,x86: decouple size of struct page from CONFIG_CMPXCHG_LOCAL

While implementing cmpxchg_double() on s390 I realized that we don't set
CONFIG_CMPXCHG_LOCAL despite the fact that we have support for it.

However setting that option will increase the size of struct page by
eight bytes on 64 bit, which we certainly do not want.  Also, it doesn't
make sense that a present cpu feature should increase the size of struct
page.

Besides that it looks like the dependency to CMPXCHG_LOCAL is wrong and
that it should depend on CMPXCHG_DOUBLE instead.

This patch:

If an architecture supports CMPXCHG_LOCAL this shouldn't result
automatically in larger struct pages if the SLUB allocator is used.
Instead introduce a new config option "HAVE_ALIGNED_STRUCT_PAGE" which
can be selected if a double word aligned struct page is required.  Also
update x86 Kconfig so that it should work as before.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig             | 8 ++++++++
 arch/x86/Kconfig         | 1 +
 include/linux/mm_types.h | 9 ++++-----
 mm/slub.c                | 6 +++---
 4 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index 2505740b81d2..a2c5c077c32d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -185,4 +185,12 @@ config HAVE_RCU_TABLE_FREE
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
+config HAVE_ALIGNED_STRUCT_PAGE
+	bool
+	help
+	  This makes sure that struct pages are double word aligned and that
+	  e.g. the SLUB allocator can perform double word atomic operations
+	  on a struct page for better performance. However selecting this
+	  might increase the size of a struct page by a word.
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a150f4c35e94..5201a2c27239 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -60,6 +60,7 @@ config X86
 	select PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI
 	select ANON_INODES
+	select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5b42f1b34eb7..3cc3062b3767 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -151,12 +151,11 @@ struct page {
 #endif
 }
 /*
- * If another subsystem starts using the double word pairing for atomic
- * operations on struct page then it must change the #if to ensure
- * proper alignment of the page struct.
+ * The struct page can be forced to be double word aligned so that atomic ops
+ * on double words work. The SLUB allocator can make use of such a feature.
  */
-#if defined(CONFIG_SLUB) && defined(CONFIG_CMPXCHG_LOCAL)
-	__attribute__((__aligned__(2*sizeof(unsigned long))))
+#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
+	__aligned(2 * sizeof(unsigned long))
 #endif
 ;
 
diff --git a/mm/slub.c b/mm/slub.c
index 5d37b5e44140..72aa84134609 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -366,7 +366,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
 		const char *n)
 {
 	VM_BUG_ON(!irqs_disabled());
-#ifdef CONFIG_CMPXCHG_DOUBLE
+#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 	if (s->flags & __CMPXCHG_DOUBLE) {
 		if (cmpxchg_double(&page->freelist, &page->counters,
 			freelist_old, counters_old,
@@ -400,7 +400,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 		void *freelist_new, unsigned long counters_new,
 		const char *n)
 {
-#ifdef CONFIG_CMPXCHG_DOUBLE
+#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 	if (s->flags & __CMPXCHG_DOUBLE) {
 		if (cmpxchg_double(&page->freelist, &page->counters,
 			freelist_old, counters_old,
@@ -3014,7 +3014,7 @@ static int kmem_cache_open(struct kmem_cache *s,
 		}
 	}
 
-#ifdef CONFIG_CMPXCHG_DOUBLE
+#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 	if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
 		/* Enable fast mode */
 		s->flags |= __CMPXCHG_DOUBLE;
-- 
cgit v1.2.3-55-g7522


From 4156153c4daddf12dd386016f96a947a01e93bf4 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Thu, 12 Jan 2012 17:17:30 -0800
Subject: mm,x86,um: move CMPXCHG_LOCAL config option

Move CMPXCHG_LOCAL and rename it to HAVE_CMPXCHG_LOCAL so architectures
can simply select the option if it is supported.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig         | 3 +++
 arch/x86/Kconfig     | 1 +
 arch/x86/Kconfig.cpu | 3 ---
 arch/x86/um/Kconfig  | 4 ----
 mm/vmstat.c          | 2 +-
 5 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index a2c5c077c32d..22182a8cc62c 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -193,4 +193,7 @@ config HAVE_ALIGNED_STRUCT_PAGE
 	  on a struct page for better performance. However selecting this
 	  might increase the size of a struct page by a word.
 
+config HAVE_CMPXCHG_LOCAL
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5201a2c27239..59717fd17bc7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -61,6 +61,7 @@ config X86
 	select HAVE_PERF_EVENTS_NMI
 	select ANON_INODES
 	select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
+	select HAVE_CMPXCHG_LOCAL if !M386
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index e3ca7e0d858c..99d2ab8b7795 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -309,9 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
-config CMPXCHG_LOCAL
-	def_bool X86_64 || (X86_32 && !M386)
-
 config CMPXCHG_DOUBLE
 	def_bool y
 
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 1d97bd84b6fb..a62bfc66239e 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -6,10 +6,6 @@ menu "UML-specific options"
 
 menu "Host processor type and features"
 
-config CMPXCHG_LOCAL
-	bool
-	default n
-
 config CMPXCHG_DOUBLE
 	bool
 	default n
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8fd603b1665e..f600557a7659 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -295,7 +295,7 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
 
-#ifdef CONFIG_CMPXCHG_LOCAL
+#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
 /*
  * If we have cmpxchg_local support then we do not need to incur the overhead
  * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
-- 
cgit v1.2.3-55-g7522


From 2565409fc0303f3ab8d66b8326702a687962a29b Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Thu, 12 Jan 2012 17:17:33 -0800
Subject: mm,x86,um: move CMPXCHG_DOUBLE config option

Move CMPXCHG_DOUBLE and rename it to HAVE_CMPXCHG_DOUBLE so architectures
can simply select the option if it is supported.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig         | 3 +++
 arch/x86/Kconfig     | 1 +
 arch/x86/Kconfig.cpu | 3 ---
 arch/x86/um/Kconfig  | 4 ----
 mm/slub.c            | 9 ++++++---
 5 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index 22182a8cc62c..4f55c736be11 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -196,4 +196,7 @@ config HAVE_ALIGNED_STRUCT_PAGE
 config HAVE_CMPXCHG_LOCAL
 	bool
 
+config HAVE_CMPXCHG_DOUBLE
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 59717fd17bc7..6c14ecd851d0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -62,6 +62,7 @@ config X86
 	select ANON_INODES
 	select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
 	select HAVE_CMPXCHG_LOCAL if !M386
+	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 99d2ab8b7795..3c57033e2211 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -309,9 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
-config CMPXCHG_DOUBLE
-	def_bool y
-
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index a62bfc66239e..b2b54d2edf53 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -6,10 +6,6 @@ menu "UML-specific options"
 
 menu "Host processor type and features"
 
-config CMPXCHG_DOUBLE
-	bool
-	default n
-
 source "arch/x86/Kconfig.cpu"
 
 endmenu
diff --git a/mm/slub.c b/mm/slub.c
index 72aa84134609..4907563ef7ff 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -366,7 +366,8 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
 		const char *n)
 {
 	VM_BUG_ON(!irqs_disabled());
-#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 	if (s->flags & __CMPXCHG_DOUBLE) {
 		if (cmpxchg_double(&page->freelist, &page->counters,
 			freelist_old, counters_old,
@@ -400,7 +401,8 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 		void *freelist_new, unsigned long counters_new,
 		const char *n)
 {
-#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 	if (s->flags & __CMPXCHG_DOUBLE) {
 		if (cmpxchg_double(&page->freelist, &page->counters,
 			freelist_old, counters_old,
@@ -3014,7 +3016,8 @@ static int kmem_cache_open(struct kmem_cache *s,
 		}
 	}
 
-#if defined(CONFIG_CMPXCHG_DOUBLE) && defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 	if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
 		/* Enable fast mode */
 		s->flags |= __CMPXCHG_DOUBLE;
-- 
cgit v1.2.3-55-g7522


From 9512938b885304f72c847379611d6018064af840 Mon Sep 17 00:00:00 2001
From: Wanlong Gao
Date: Thu, 12 Jan 2012 17:20:09 -0800
Subject: cpumask: update setup_node_to_cpumask_map() comments

node_to_cpumask() has been replaced by cpumask_of_node(), and wholly
removed since commit 29c337a0 ("cpumask: remove obsolete node_to_cpumask
now everyone uses cpumask_of_node").

So update the comments for setup_node_to_cpumask_map().

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/mm/numa.c | 2 +-
 arch/x86/mm/numa.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 4ff3d8e411a7..3feefc3842a8 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -58,7 +58,7 @@ static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
  */
 static void __init setup_node_to_cpumask_map(void)
 {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 020cd2e80873..19d3fa08b119 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu)
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
  * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
  */
 void __init setup_node_to_cpumask_map(void)
-- 
cgit v1.2.3-55-g7522


From a3301b751b19f0efbafddc4034f8e7ce6bf3007b Mon Sep 17 00:00:00 2001
From: Srivatsa S. Bhat
Date: Sat, 14 Jan 2012 08:11:31 +0530
Subject: x86/mce: Fix CPU hotplug and suspend regression related to MCE

Commit 8a25a2fd126c ("cpu: convert 'cpu' and 'machinecheck' sysdev_class
to a regular subsystem") changed how things are dealt with in the MCE
subsystem.  Some of the things that got broken due to this are CPU
hotplug and suspend/hibernate.

MCE uses per_cpu allocations of struct device.  So, when a CPU goes
offline and comes back online, in order to ensure that we start from a
clean slate with respect to the MCE subsystem, zero out the entire
per_cpu device structure to 0 before using it.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index f22a9f7f6390..29ba3297e480 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2011,7 +2011,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	memset(&dev->kobj, 0, sizeof(struct kobject));
+	memset(dev, 0, sizeof(struct device));
 	dev->id  = cpu;
 	dev->bus = &mce_subsys;
 
-- 
cgit v1.2.3-55-g7522


From e032d80774315869aa2285b217fdbbfed86c0b49 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Mon, 16 Jan 2012 14:40:28 -0800
Subject: mce: fix warning messages about static struct mce_device

When suspending, there was a large list of warnings going something like:

	Device 'machinecheck1' does not have a release() function, it is broken and must be fixed

This patch turns the static mce_devices into dynamically allocated, and
properly frees them when they are removed from the system.  It solves
the warning messages on my laptop here.

Reported-by: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Tested-by: Djalal Harouni <tixxdz@opendz.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@amd64.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/mce.h           |  2 +-
 arch/x86/kernel/cpu/mcheck/mce.c     | 18 ++++++++++++++----
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 18 +++++++++++-------
 3 files changed, 26 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f35ce43c1a77..6aefb14cbbc5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -151,7 +151,7 @@ static inline void enable_p5_mce(void) {}
 
 void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct device, mce_device);
+extern struct device *mce_device[CONFIG_NR_CPUS];
 
 /*
  * Maximum banks number.
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 29ba3297e480..5a11ae2e9e91 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1859,7 +1859,7 @@ static struct bus_type mce_subsys = {
 	.dev_name	= "machinecheck",
 };
 
-DEFINE_PER_CPU(struct device, mce_device);
+struct device *mce_device[CONFIG_NR_CPUS];
 
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -2001,19 +2001,27 @@ static struct device_attribute *mce_device_attrs[] = {
 
 static cpumask_var_t mce_device_initialized;
 
+static void mce_device_release(struct device *dev)
+{
+	kfree(dev);
+}
+
 /* Per cpu device init. All of the cpus still share the same ctrl bank: */
 static __cpuinit int mce_device_create(unsigned int cpu)
 {
-	struct device *dev = &per_cpu(mce_device, cpu);
+	struct device *dev;
 	int err;
 	int i, j;
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	memset(dev, 0, sizeof(struct device));
+	dev = kzalloc(sizeof *dev, GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
 	dev->id  = cpu;
 	dev->bus = &mce_subsys;
+	dev->release = &mce_device_release;
 
 	err = device_register(dev);
 	if (err)
@@ -2030,6 +2038,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
 			goto error2;
 	}
 	cpumask_set_cpu(cpu, mce_device_initialized);
+	mce_device[cpu] = dev;
 
 	return 0;
 error2:
@@ -2046,7 +2055,7 @@ error:
 
 static __cpuinit void mce_device_remove(unsigned int cpu)
 {
-	struct device *dev = &per_cpu(mce_device, cpu);
+	struct device *dev = mce_device[cpu];
 	int i;
 
 	if (!cpumask_test_cpu(cpu, mce_device_initialized))
@@ -2060,6 +2069,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
 
 	device_unregister(dev);
 	cpumask_clear_cpu(cpu, mce_device_initialized);
+	mce_device[cpu] = NULL;
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index ba0b94a7e204..786e76a86322 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -523,6 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 {
 	int i, err = 0;
 	struct threshold_bank *b = NULL;
+	struct device *dev = mce_device[cpu];
 	char name[32];
 
 	sprintf(name, "threshold_bank%i", bank);
@@ -543,8 +544,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (!b)
 			goto out;
 
-		err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj,
-					b->kobj, name);
+		err = sysfs_create_link(&dev->kobj, b->kobj, name);
 		if (err)
 			goto out;
 
@@ -565,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		goto out;
 	}
 
-	b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj);
+	b->kobj = kobject_create_and_add(name, &dev->kobj);
 	if (!b->kobj)
 		goto out_free;
 
@@ -585,8 +585,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (i == cpu)
 			continue;
 
-		err = sysfs_create_link(&per_cpu(mce_device, i).kobj,
-					b->kobj, name);
+		dev = mce_device[i];
+		if (dev)
+			err = sysfs_create_link(&dev->kobj,b->kobj, name);
 		if (err)
 			goto out;
 
@@ -649,6 +650,7 @@ static void deallocate_threshold_block(unsigned int cpu,
 static void threshold_remove_bank(unsigned int cpu, int bank)
 {
 	struct threshold_bank *b;
+	struct device *dev;
 	char name[32];
 	int i = 0;
 
@@ -663,7 +665,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
-		sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name);
+		sysfs_remove_link(&mce_device[cpu]->kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 
 		return;
@@ -675,7 +677,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 		if (i == cpu)
 			continue;
 
-		sysfs_remove_link(&per_cpu(mce_device, i).kobj, name);
+		dev = mce_device[i];
+		if (dev)
+			sysfs_remove_link(&dev->kobj, name);
 		per_cpu(threshold_banks, i)[bank] = NULL;
 	}
 
-- 
cgit v1.2.3-55-g7522


From b54ac6d2a25084667da781c7ca2cebef52a2bcdd Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Thu, 8 Dec 2011 11:25:49 +0800
Subject: ACPI, Record ACPI NVS regions

Some firmware will access memory in ACPI NVS region via APEI.  That
is, instructions in APEI ERST/EINJ table will read/write ACPI NVS
region.  The original resource conflict checking in APEI code will
check memory/ioport accessed by APEI via general resource management
mechanism.  But ACPI NVS region is marked as busy already, so that the
false resource conflict will prevent APEI ERST/EINJ to work.

To fix this, this patch record ACPI NVS regions, so that we can avoid
request resources for memory region inside it.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/e820.c |  4 ++--
 drivers/acpi/Makefile  |  3 ++-
 drivers/acpi/nvs.c     | 53 +++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/acpi.h   | 20 +++++++++++++------
 4 files changed, 70 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 303a0e48f076..51c3b186e5b9 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -714,7 +714,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
 }
 #endif
 
-#ifdef CONFIG_HIBERNATION
+#ifdef CONFIG_ACPI
 /**
  * Mark ACPI NVS memory region, so that we can save/restore it during
  * hibernation and the subsequent resume.
@@ -727,7 +727,7 @@ static int __init e820_mark_nvs_memory(void)
 		struct e820entry *ei = &e820.map[i];
 
 		if (ei->type == E820_NVS)
-			suspend_nvs_register(ei->addr, ei->size);
+			acpi_nvs_register(ei->addr, ei->size);
 	}
 
 	return 0;
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index ecb26b4f29a0..c07f44f05f9d 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -20,11 +20,12 @@ obj-y				+= acpi.o \
 # All the builtin files are in the "acpi." module_param namespace.
 acpi-y				+= osl.o utils.o reboot.o
 acpi-y				+= atomicio.o
+acpi-y				+= nvs.o
 
 # sleep related files
 acpi-y				+= wakeup.o
 acpi-y				+= sleep.o
-acpi-$(CONFIG_ACPI_SLEEP)	+= proc.o nvs.o
+acpi-$(CONFIG_ACPI_SLEEP)	+= proc.o
 
 
 #
diff --git a/drivers/acpi/nvs.c b/drivers/acpi/nvs.c
index 096787b43c96..7a2035fa8c71 100644
--- a/drivers/acpi/nvs.c
+++ b/drivers/acpi/nvs.c
@@ -15,6 +15,56 @@
 #include <linux/acpi_io.h>
 #include <acpi/acpiosxf.h>
 
+/* ACPI NVS regions, APEI may use it */
+
+struct nvs_region {
+	__u64 phys_start;
+	__u64 size;
+	struct list_head node;
+};
+
+static LIST_HEAD(nvs_region_list);
+
+#ifdef CONFIG_ACPI_SLEEP
+static int suspend_nvs_register(unsigned long start, unsigned long size);
+#else
+static inline int suspend_nvs_register(unsigned long a, unsigned long b)
+{
+	return 0;
+}
+#endif
+
+int acpi_nvs_register(__u64 start, __u64 size)
+{
+	struct nvs_region *region;
+
+	region = kmalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+	region->phys_start = start;
+	region->size = size;
+	list_add_tail(&region->node, &nvs_region_list);
+
+	return suspend_nvs_register(start, size);
+}
+
+int acpi_nvs_for_each_region(int (*func)(__u64 start, __u64 size, void *data),
+			     void *data)
+{
+	int rc;
+	struct nvs_region *region;
+
+	list_for_each_entry(region, &nvs_region_list, node) {
+		rc = func(region->phys_start, region->size, data);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+
+#ifdef CONFIG_ACPI_SLEEP
 /*
  * Platforms, like ACPI, may want us to save some memory used by them during
  * suspend and to restore the contents of this memory during the subsequent
@@ -41,7 +91,7 @@ static LIST_HEAD(nvs_list);
  *	things so that the data from page-aligned addresses in this region will
  *	be copied into separate RAM pages.
  */
-int suspend_nvs_register(unsigned long start, unsigned long size)
+static int suspend_nvs_register(unsigned long start, unsigned long size)
 {
 	struct nvs_page *entry, *next;
 
@@ -159,3 +209,4 @@ void suspend_nvs_restore(void)
 		if (entry->data)
 			memcpy(entry->kaddr, entry->data, entry->size);
 }
+#endif
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 6001b4da39dd..26b75442ff7a 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -306,6 +306,11 @@ extern acpi_status acpi_pci_osc_control_set(acpi_handle handle,
 					     u32 *mask, u32 req);
 extern void acpi_early_init(void);
 
+extern int acpi_nvs_register(__u64 start, __u64 size);
+
+extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *),
+				    void *data);
+
 #else	/* !CONFIG_ACPI */
 
 #define acpi_disabled 1
@@ -348,15 +353,18 @@ static inline int acpi_table_parse(char *id,
 {
 	return -1;
 }
-#endif	/* !CONFIG_ACPI */
 
-#ifdef CONFIG_ACPI_SLEEP
-int suspend_nvs_register(unsigned long start, unsigned long size);
-#else
-static inline int suspend_nvs_register(unsigned long a, unsigned long b)
+static inline int acpi_nvs_register(__u64 start, __u64 size)
 {
 	return 0;
 }
-#endif
+
+static inline int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *),
+					   void *data)
+{
+	return 0;
+}
+
+#endif	/* !CONFIG_ACPI */
 
 #endif	/*_LINUX_ACPI_H*/
-- 
cgit v1.2.3-55-g7522


From cd298f60a2451a16e0f077404bf69b62ec868733 Mon Sep 17 00:00:00 2001
From: Kurt Garloff
Date: Tue, 17 Jan 2012 04:20:31 -0500
Subject: ACPI, x86: Use SRAT table rev to use 8bit or 32bit PXM fields
 (x86/x86-64)

In SRAT v1, we had 8bit proximity domain (PXM) fields; SRAT v2 provides
32bits for these. The new fields were reserved before.
According to the ACPI spec, the OS must disregrard reserved fields.

x86/x86-64 was rather inconsistent prior to this patch; it used 8 bits
for the pxm field in cpu_affinity, but 32 bits in mem_affinity.
This patch makes it consistent: Either use 8 bits consistently (SRAT
rev 1 or lower) or 32 bits (SRAT rev 2 or higher).

cc: x86@kernel.org
Signed-off-by: Kurt Garloff <kurt@garloff.de>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/mm/srat.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 81dbfdeb080d..7efd0c615d58 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -104,6 +104,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
 		return;
 	pxm = pa->proximity_domain_lo;
+	if (acpi_srat_revision >= 2)
+		pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
 	node = setup_node(pxm);
 	if (node < 0) {
 		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
@@ -155,6 +157,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 	start = ma->base_address;
 	end = start + ma->length;
 	pxm = ma->proximity_domain;
+	if (acpi_srat_revision <= 1)
+		pxm &= 0xff;
 	node = setup_node(pxm);
 	if (node < 0) {
 		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
-- 
cgit v1.2.3-55-g7522


From d7e7528bcd456f5c36ad4a202ccfb43c5aa98bc4 Mon Sep 17 00:00:00 2001
From: Eric Paris
Date: Tue, 3 Jan 2012 14:23:06 -0500
Subject: Audit: push audit success and retcode into arch ptrace.h

The audit system previously expected arches calling to audit_syscall_exit to
supply as arguments if the syscall was a success and what the return code was.
Audit also provides a helper AUDITSC_RESULT which was supposed to simplify things
by converting from negative retcodes to an audit internal magic value stating
success or failure.  This helper was wrong and could indicate that a valid
pointer returned to userspace was a failed syscall.  The fix is to fix the
layering foolishness.  We now pass audit_syscall_exit a struct pt_reg and it
in turns calls back into arch code to collect the return value and to
determine if the syscall was a success or failure.  We also define a generic
is_syscall_success() macro which determines success/failure based on if the
value is < -MAX_ERRNO.  This works for arches like x86 which do not use a
separate mechanism to indicate syscall failure.

We make both the is_syscall_success() and regs_return_value() static inlines
instead of macros.  The reason is because the audit function must take a void*
for the regs.  (uml calls theirs struct uml_pt_regs instead of just struct
pt_regs so audit_syscall_exit can't take a struct pt_regs).  Since the audit
function takes a void* we need to use static inlines to cast it back to the
arch correct structure to dereference it.

The other major change is that on some arches, like ia64, MIPS and ppc, we
change regs_return_value() to give us the negative value on syscall failure.
THE only other user of this macro, kretprobe_example.c, won't notice and it
makes the value signed consistently for the audit functions across all archs.

In arch/sh/kernel/ptrace_64.c I see that we were using regs[9] in the old
audit code as the return value.  But the ptrace_64.h code defined the macro
regs_return_value() as regs[3].  I have no idea which one is correct, but this
patch now uses the regs_return_value() function, so it now uses regs[3].

For powerpc we previously used regs->result but now use the
regs_return_value() function which uses regs->gprs[3].  regs->gprs[3] is
always positive so the regs_return_value(), much like ia64 makes it negative
before calling the audit code when appropriate.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: H. Peter Anvin <hpa@zytor.com> [for x86 portion]
Acked-by: Tony Luck <tony.luck@intel.com> [for ia64]
Acked-by: Richard Weinberger <richard@nod.at> [for uml]
Acked-by: David S. Miller <davem@davemloft.net> [for sparc]
Acked-by: Ralf Baechle <ralf@linux-mips.org> [for mips]
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> [for ppc]
---
 arch/ia64/include/asm/ptrace.h       | 13 ++++++++++++-
 arch/ia64/kernel/ptrace.c            |  9 +--------
 arch/microblaze/include/asm/ptrace.h |  5 +++++
 arch/microblaze/kernel/ptrace.c      |  3 +--
 arch/mips/include/asm/ptrace.h       | 14 +++++++++++++-
 arch/mips/kernel/ptrace.c            |  4 +---
 arch/powerpc/include/asm/ptrace.h    | 13 ++++++++++++-
 arch/powerpc/kernel/ptrace.c         |  4 +---
 arch/s390/include/asm/ptrace.h       |  6 +++++-
 arch/s390/kernel/ptrace.c            |  4 +---
 arch/sh/include/asm/ptrace_32.h      |  5 ++++-
 arch/sh/include/asm/ptrace_64.h      |  5 ++++-
 arch/sh/kernel/ptrace_32.c           |  4 +---
 arch/sh/kernel/ptrace_64.c           |  4 +---
 arch/sparc/include/asm/ptrace.h      | 10 +++++++++-
 arch/sparc/kernel/ptrace_64.c        | 11 +----------
 arch/um/kernel/ptrace.c              |  4 ++--
 arch/x86/ia32/ia32entry.S            | 10 +++++-----
 arch/x86/kernel/entry_32.S           |  8 ++++----
 arch/x86/kernel/entry_64.S           | 10 +++++-----
 arch/x86/kernel/ptrace.c             |  3 +--
 arch/x86/kernel/vm86_32.c            |  4 ++--
 arch/x86/um/shared/sysdep/ptrace.h   |  5 +++++
 include/linux/audit.h                | 22 ++++++++++++++--------
 include/linux/ptrace.h               | 10 ++++++++++
 kernel/auditsc.c                     | 16 ++++++++++++----
 26 files changed, 132 insertions(+), 74 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h
index f5cb27614e35..68c98f5b3ca6 100644
--- a/arch/ia64/include/asm/ptrace.h
+++ b/arch/ia64/include/asm/ptrace.h
@@ -246,7 +246,18 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
 	return regs->ar_bspstore;
 }
 
-#define regs_return_value(regs) ((regs)->r8)
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+	return regs->r10 != -1;
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	if (is_syscall_success(regs))
+		return regs->r8;
+	else
+		return -regs->r8;
+}
 
 /* Conserve space in histogram by encoding slot bits in address
  * bits 2 and 3 rather than bits 0 and 1.
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 8848f43d819e..2c154088cce7 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -1268,14 +1268,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3,
 {
 	int step;
 
-	if (unlikely(current->audit_context)) {
-		int success = AUDITSC_RESULT(regs.r10);
-		long result = regs.r8;
-
-		if (success != AUDITSC_SUCCESS)
-			result = -result;
-		audit_syscall_exit(success, result);
-	}
+	audit_syscall_exit(&regs);
 
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
diff --git a/arch/microblaze/include/asm/ptrace.h b/arch/microblaze/include/asm/ptrace.h
index 816bee64b196..94e92c805859 100644
--- a/arch/microblaze/include/asm/ptrace.h
+++ b/arch/microblaze/include/asm/ptrace.h
@@ -61,6 +61,11 @@ struct pt_regs {
 #define instruction_pointer(regs)	((regs)->pc)
 #define profile_pc(regs)		instruction_pointer(regs)
 
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->r3;
+}
+
 #else /* __KERNEL__ */
 
 /* pt_regs offsets used by gdbserver etc in ptrace syscalls */
diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c
index 043cb58f9c44..f564b1bfd386 100644
--- a/arch/microblaze/kernel/ptrace.c
+++ b/arch/microblaze/kernel/ptrace.c
@@ -159,8 +159,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
 	int step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->r3), regs->r3);
+	audit_syscall_exit(regs);
 
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h
index de39b1f343ea..7d409505df2d 100644
--- a/arch/mips/include/asm/ptrace.h
+++ b/arch/mips/include/asm/ptrace.h
@@ -137,7 +137,19 @@ extern int ptrace_set_watch_regs(struct task_struct *child,
  */
 #define user_mode(regs) (((regs)->cp0_status & KU_MASK) == KU_USER)
 
-#define regs_return_value(_regs) ((_regs)->regs[2])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+	return !regs->regs[7];
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	if (is_syscall_success(regs))
+		return regs->regs[2];
+	else
+		return -regs->regs[2];
+}
+
 #define instruction_pointer(regs) ((regs)->cp0_epc)
 #define profile_pc(regs) instruction_pointer(regs)
 
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 4e6ea1ffad46..ab0f1963a7bd 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -572,9 +572,7 @@ out:
  */
 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]),
-		                   -regs->regs[2]);
+	audit_syscall_exit(regs);
 
 	if (!(current->ptrace & PT_PTRACED))
 		return;
diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
index 48223f9b8728..78a205162fd7 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -86,7 +86,18 @@ struct pt_regs {
 #define instruction_pointer(regs) ((regs)->nip)
 #define user_stack_pointer(regs) ((regs)->gpr[1])
 #define kernel_stack_pointer(regs) ((regs)->gpr[1])
-#define regs_return_value(regs) ((regs)->gpr[3])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+	return !(regs->ccr & 0x10000000);
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	if (is_syscall_success(regs))
+		return regs->gpr[3];
+	else
+		return -regs->gpr[3];
+}
 
 #ifdef CONFIG_SMP
 extern unsigned long profile_pc(struct pt_regs *regs);
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 5de73dbd15c7..09d31c12a5e3 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1748,9 +1748,7 @@ void do_syscall_trace_leave(struct pt_regs *regs)
 {
 	int step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS,
-				   regs->result);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->result);
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index 56da355678f4..aeb77f017985 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -541,9 +541,13 @@ struct user_regs_struct
 #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0)
 #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN)
 #define user_stack_pointer(regs)((regs)->gprs[15])
-#define regs_return_value(regs)((regs)->gprs[2])
 #define profile_pc(regs) instruction_pointer(regs)
 
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->gprs[2];
+}
+
 int regs_query_register_offset(const char *name);
 const char *regs_query_register_name(unsigned int offset);
 unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset);
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 573bc29551ef..f52758600980 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -751,9 +751,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 
 asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
 {
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]),
-				   regs->gprs[2]);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->gprs[2]);
diff --git a/arch/sh/include/asm/ptrace_32.h b/arch/sh/include/asm/ptrace_32.h
index 6c2239cca1a2..2d3e906aa722 100644
--- a/arch/sh/include/asm/ptrace_32.h
+++ b/arch/sh/include/asm/ptrace_32.h
@@ -76,7 +76,10 @@ struct pt_dspregs {
 #ifdef __KERNEL__
 
 #define MAX_REG_OFFSET		offsetof(struct pt_regs, tra)
-#define regs_return_value(_regs)	((_regs)->regs[0])
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->regs[0];
+}
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/sh/include/asm/ptrace_64.h b/arch/sh/include/asm/ptrace_64.h
index bf9be7764d69..eb3fcceaf64b 100644
--- a/arch/sh/include/asm/ptrace_64.h
+++ b/arch/sh/include/asm/ptrace_64.h
@@ -13,7 +13,10 @@ struct pt_regs {
 #ifdef __KERNEL__
 
 #define MAX_REG_OFFSET		offsetof(struct pt_regs, tregs[7])
-#define regs_return_value(_regs)	((_regs)->regs[3])
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->regs[3];
+}
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index 92b3c276339a..c0b5c179d27b 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -530,9 +530,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
 	int step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->regs[0]),
-				   regs->regs[0]);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->regs[0]);
diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c
index c8f97649f354..ba720d686435 100644
--- a/arch/sh/kernel/ptrace_64.c
+++ b/arch/sh/kernel/ptrace_64.c
@@ -548,9 +548,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
 	int step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->regs[9]),
-				   regs->regs[9]);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->regs[9]);
diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h
index a0e1bcf843a1..c00c3b5c2806 100644
--- a/arch/sparc/include/asm/ptrace.h
+++ b/arch/sparc/include/asm/ptrace.h
@@ -207,7 +207,15 @@ do {	current_thread_info()->syscall_noerror = 1; \
 #define instruction_pointer(regs) ((regs)->tpc)
 #define instruction_pointer_set(regs, val) ((regs)->tpc = (val))
 #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP])
-#define regs_return_value(regs) ((regs)->u_regs[UREG_I0])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+	return !(regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY));
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+	return regs->u_regs[UREG_I0];
+}
 #ifdef CONFIG_SMP
 extern unsigned long profile_pc(struct pt_regs *);
 #else
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index 96ee50a80661..c73c8c50f117 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -1086,17 +1086,8 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 
 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
-#ifdef CONFIG_AUDITSYSCALL
-	if (unlikely(current->audit_context)) {
-		unsigned long tstate = regs->tstate;
-		int result = AUDITSC_SUCCESS;
+	audit_syscall_exit(regs);
 
-		if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY)))
-			result = AUDITSC_FAILURE;
-
-		audit_syscall_exit(result, regs->u_regs[UREG_I0]);
-	}
-#endif
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->u_regs[UREG_G1]);
 
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index c9da32b0c707..2ccf25c42feb 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -175,8 +175,8 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit)
 					    UPT_SYSCALL_ARG2(regs),
 					    UPT_SYSCALL_ARG3(regs),
 					    UPT_SYSCALL_ARG4(regs));
-		else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)),
-					UPT_SYSCALL_RET(regs));
+		else
+			audit_syscall_exit(regs);
 	}
 
 	/* Fake a debug trap */
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 3e274564f6bf..64ced0b8f8fd 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -14,6 +14,7 @@
 #include <asm/segment.h>
 #include <asm/irqflags.h>
 #include <linux/linkage.h>
+#include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -208,12 +209,11 @@ sysexit_from_sys_call:
 	TRACE_IRQS_ON
 	sti
 	movl %eax,%esi		/* second arg, syscall return value */
-	cmpl $0,%eax		/* is it < 0? */
-	setl %al		/* 1 if so, 0 if not */
+	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
+	setbe %al		/* 1 if so, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
-	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-	call audit_syscall_exit
-	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall return value */
+	call __audit_syscall_exit
+	movq RAX-ARGOFFSET(%rsp),%rax	/* reload syscall return value */
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	cli
 	TRACE_IRQS_OFF
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 22d0e21b4dd7..a22facf06f0e 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -42,6 +42,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/err.h>
 #include <asm/thread_info.h>
 #include <asm/irqflags.h>
 #include <asm/errno.h>
@@ -466,11 +467,10 @@ sysexit_audit:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_ANY)
 	movl %eax,%edx		/* second arg, syscall return value */
-	cmpl $0,%eax		/* is it < 0? */
-	setl %al		/* 1 if so, 0 if not */
+	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
+	setbe %al		/* 1 if so, 0 if not */
 	movzbl %al,%eax		/* zero-extend that */
-	inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-	call audit_syscall_exit
+	call __audit_syscall_exit
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a20e1cb9dc87..e51393dd93a3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,7 @@
 #include <asm/paravirt.h>
 #include <asm/ftrace.h>
 #include <asm/percpu.h>
+#include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -563,17 +564,16 @@ auditsys:
 	jmp system_call_fastpath
 
 	/*
-	 * Return fast path for syscall audit.  Call audit_syscall_exit()
+	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
 	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
 	 * masked off.
 	 */
 sysret_audit:
 	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
-	cmpq $0,%rsi		/* is it < 0? */
-	setl %al		/* 1 if so, 0 if not */
+	cmpq $-MAX_ERRNO,%rsi	/* is it < -MAX_ERRNO? */
+	setbe %al		/* 1 if so, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
-	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-	call audit_syscall_exit
+	call __audit_syscall_exit
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	jmp sysret_check
 #endif	/* CONFIG_AUDITSYSCALL */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 89a04c7b5bb6..8b0218758775 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1414,8 +1414,7 @@ void syscall_trace_leave(struct pt_regs *regs)
 {
 	bool step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->ax);
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 863f8753ab0a..af17e1c966dc 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -335,9 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	if (info->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
 
-	/*call audit_syscall_exit since we do not exit via the normal paths */
+	/*call __audit_syscall_exit since we do not exit via the normal paths */
 	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(0), 0);
+		__audit_syscall_exit(1, 0);
 
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 711b1621747f..5ef9344a8b24 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -3,3 +3,8 @@
 #else
 #include "ptrace_64.h"
 #endif
+
+static inline long regs_return_value(struct uml_pt_regs *regs)
+{
+	return UPT_SYSCALL_RET(regs);
+}
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 6e1c533f9b46..3d65e4b3ba06 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -26,6 +26,7 @@
 
 #include <linux/types.h>
 #include <linux/elf-em.h>
+#include <linux/ptrace.h>
 
 /* The netlink messages for the audit system is divided into blocks:
  * 1000 - 1099 are for commanding the audit system
@@ -408,10 +409,6 @@ struct audit_field {
 	void				*lsm_rule;
 };
 
-#define AUDITSC_INVALID 0
-#define AUDITSC_SUCCESS 1
-#define AUDITSC_FAILURE 2
-#define AUDITSC_RESULT(x) ( ((long)(x))<0?AUDITSC_FAILURE:AUDITSC_SUCCESS )
 extern int __init audit_register_class(int class, unsigned *list);
 extern int audit_classify_syscall(int abi, unsigned syscall);
 extern int audit_classify_arch(int arch);
@@ -424,7 +421,7 @@ extern void audit_free(struct task_struct *task);
 extern void audit_syscall_entry(int arch,
 				int major, unsigned long a0, unsigned long a1,
 				unsigned long a2, unsigned long a3);
-extern void audit_syscall_exit(int failed, long return_code);
+extern void __audit_syscall_exit(int ret_success, long ret_value);
 extern void __audit_getname(const char *name);
 extern void audit_putname(const char *name);
 extern void __audit_inode(const char *name, const struct dentry *dentry);
@@ -438,6 +435,15 @@ static inline int audit_dummy_context(void)
 	void *p = current->audit_context;
 	return !p || *(int *)p;
 }
+static inline void audit_syscall_exit(void *pt_regs)
+{
+	if (unlikely(current->audit_context)) {
+		int success = is_syscall_success(pt_regs);
+		int return_code = regs_return_value(pt_regs);
+
+		__audit_syscall_exit(success, return_code);
+	}
+}
 static inline void audit_getname(const char *name)
 {
 	if (unlikely(!audit_dummy_context()))
@@ -551,12 +557,12 @@ static inline void audit_mmap_fd(int fd, int flags)
 
 extern int audit_n_rules;
 extern int audit_signals;
-#else
+#else /* CONFIG_AUDITSYSCALL */
 #define audit_finish_fork(t)
 #define audit_alloc(t) ({ 0; })
 #define audit_free(t) do { ; } while (0)
 #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0)
-#define audit_syscall_exit(f,r) do { ; } while (0)
+#define audit_syscall_exit(r) do { ; } while (0)
 #define audit_dummy_context() 1
 #define audit_getname(n) do { ; } while (0)
 #define audit_putname(n) do { ; } while (0)
@@ -587,7 +593,7 @@ extern int audit_signals;
 #define audit_ptrace(t) ((void)0)
 #define audit_n_rules 0
 #define audit_signals 0
-#endif
+#endif /* CONFIG_AUDITSYSCALL */
 
 #ifdef CONFIG_AUDIT
 /* These are defined in audit.c */
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 800f113bea66..dd4cefa6519d 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -112,6 +112,7 @@
 
 #include <linux/compiler.h>		/* For unlikely.  */
 #include <linux/sched.h>		/* For struct task_struct.  */
+#include <linux/err.h>			/* for IS_ERR_VALUE */
 
 
 extern long arch_ptrace(struct task_struct *child, long request,
@@ -265,6 +266,15 @@ static inline void ptrace_release_task(struct task_struct *task)
 #define force_successful_syscall_return() do { } while (0)
 #endif
 
+#ifndef is_syscall_success
+/*
+ * On most systems we can tell if a syscall is a success based on if the retval
+ * is an error value.  On some systems like ia64 and powerpc they have different
+ * indicators of success/failure and must define their own.
+ */
+#define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs))))
+#endif
+
 /*
  * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__.
  *
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e9bcb93800d8..3d2853808185 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -70,6 +70,11 @@
 
 #include "audit.h"
 
+/* flags stating the success for a syscall */
+#define AUDITSC_INVALID 0
+#define AUDITSC_SUCCESS 1
+#define AUDITSC_FAILURE 2
+
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
  * for saving names from getname().  If we get more names we will allocate
  * a name dynamically and also add those to the list anchored by names_list. */
@@ -1724,8 +1729,7 @@ void audit_finish_fork(struct task_struct *child)
 
 /**
  * audit_syscall_exit - deallocate audit context after a system call
- * @valid: success/failure flag
- * @return_code: syscall return value
+ * @pt_regs: syscall registers
  *
  * Tear down after system call.  If the audit context has been marked as
  * auditable (either because of the AUDIT_RECORD_CONTEXT state from
@@ -1733,13 +1737,17 @@ void audit_finish_fork(struct task_struct *child)
  * message), then write out the syscall information.  In call cases,
  * free the names stored from getname().
  */
-void audit_syscall_exit(int valid, long return_code)
+void __audit_syscall_exit(int success, long return_code)
 {
 	struct task_struct *tsk = current;
 	struct audit_context *context;
 
-	context = audit_get_context(tsk, valid, return_code);
+	if (success)
+		success = AUDITSC_SUCCESS;
+	else
+		success = AUDITSC_FAILURE;
 
+	context = audit_get_context(tsk, success, return_code);
 	if (likely(!context))
 		return;
 
-- 
cgit v1.2.3-55-g7522


From f031cd25568a390dc2c9c3a4015054183753449a Mon Sep 17 00:00:00 2001
From: Eric Paris
Date: Tue, 3 Jan 2012 14:23:06 -0500
Subject: audit: ia32entry.S sign extend error codes when calling 64 bit code

In the ia32entry syscall exit audit fastpath we have assembly code which calls
__audit_syscall_exit directly.  This code was, however, zeroes the upper 32
bits of the return code.  It then proceeded to call code which expects longs
to be 64bits long.  In order to handle code which expects longs to be 64bit we
sign extend the return code if that code is an error.  Thus the
__audit_syscall_exit function can correctly handle using the values in
snprintf("%ld").  This fixes the regression introduced in 5cbf1565f29eb57a86a.

Old record:
type=SYSCALL msg=audit(1306197182.256:281): arch=40000003 syscall=192 success=no exit=4294967283
New record:
type=SYSCALL msg=audit(1306197182.256:281): arch=40000003 syscall=192 success=no exit=-13

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/ia32/ia32entry.S | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 64ced0b8f8fd..025f0f01d254 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -210,7 +210,9 @@ sysexit_from_sys_call:
 	sti
 	movl %eax,%esi		/* second arg, syscall return value */
 	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
-	setbe %al		/* 1 if so, 0 if not */
+	jbe 1f
+	movslq %eax, %rsi	/* if error sign extend to 64 bits */
+1:	setbe %al		/* 1 if error, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
 	call __audit_syscall_exit
 	movq RAX-ARGOFFSET(%rsp),%rax	/* reload syscall return value */
-- 
cgit v1.2.3-55-g7522


From b05d8447e7821695bc2fa3359431f7a664232743 Mon Sep 17 00:00:00 2001
From: Eric Paris
Date: Tue, 3 Jan 2012 14:23:06 -0500
Subject: audit: inline audit_syscall_entry to reduce burden on archs

Every arch calls:

if (unlikely(current->audit_context))
	audit_syscall_entry()

which requires knowledge about audit (the existance of audit_context) in
the arch code.  Just do it all in static inline in audit.h so that arch's
can remain blissfully ignorant.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 arch/ia64/kernel/ptrace.c       |  9 +--------
 arch/microblaze/kernel/ptrace.c |  6 ++----
 arch/mips/kernel/ptrace.c       |  7 +++----
 arch/powerpc/kernel/ptrace.c    | 26 ++++++++++++--------------
 arch/s390/kernel/ptrace.c       | 11 +++++------
 arch/sh/kernel/ptrace_32.c      |  7 +++----
 arch/sh/kernel/ptrace_64.c      |  7 +++----
 arch/sparc/kernel/ptrace_64.c   | 17 ++++++++---------
 arch/um/kernel/ptrace.c         | 20 +++++++++-----------
 arch/x86/ia32/ia32entry.S       |  2 +-
 arch/x86/kernel/entry_32.S      |  2 +-
 arch/x86/kernel/entry_64.S      |  4 ++--
 arch/x86/kernel/ptrace.c        | 22 ++++++++++------------
 arch/xtensa/kernel/ptrace.c     |  3 +--
 include/linux/audit.h           | 13 ++++++++++---
 kernel/auditsc.c                |  2 +-
 16 files changed, 72 insertions(+), 86 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 2c154088cce7..dad91661ddf9 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -1246,15 +1246,8 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3,
 	if (test_thread_flag(TIF_RESTORE_RSE))
 		ia64_sync_krbs();
 
-	if (unlikely(current->audit_context)) {
-		long syscall;
-		int arch;
 
-		syscall = regs.r15;
-		arch = AUDIT_ARCH_IA64;
-
-		audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3);
-	}
+	audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3);
 
 	return 0;
 }
diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c
index f564b1bfd386..6eb2aa927d89 100644
--- a/arch/microblaze/kernel/ptrace.c
+++ b/arch/microblaze/kernel/ptrace.c
@@ -147,10 +147,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 		 */
 		ret = -1L;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(EM_MICROBLAZE, regs->r12,
-				    regs->r5, regs->r6,
-				    regs->r7, regs->r8);
+	audit_syscall_entry(EM_MICROBLAZE, regs->r12, regs->r5, regs->r6,
+			    regs->r7, regs->r8);
 
 	return ret ?: regs->r12;
 }
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index ab0f1963a7bd..7786b608d932 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -560,10 +560,9 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs)
 	}
 
 out:
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(audit_arch(), regs->regs[2],
-				    regs->regs[4], regs->regs[5],
-				    regs->regs[6], regs->regs[7]);
+	audit_syscall_entry(audit_arch(), regs->regs[2],
+			    regs->regs[4], regs->regs[5],
+			    regs->regs[6], regs->regs[7]);
 }
 
 /*
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 09d31c12a5e3..5b43325402bc 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1724,22 +1724,20 @@ long do_syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->gpr[0]);
 
-	if (unlikely(current->audit_context)) {
 #ifdef CONFIG_PPC64
-		if (!is_32bit_task())
-			audit_syscall_entry(AUDIT_ARCH_PPC64,
-					    regs->gpr[0],
-					    regs->gpr[3], regs->gpr[4],
-					    regs->gpr[5], regs->gpr[6]);
-		else
+	if (!is_32bit_task())
+		audit_syscall_entry(AUDIT_ARCH_PPC64,
+				    regs->gpr[0],
+				    regs->gpr[3], regs->gpr[4],
+				    regs->gpr[5], regs->gpr[6]);
+	else
 #endif
-			audit_syscall_entry(AUDIT_ARCH_PPC,
-					    regs->gpr[0],
-					    regs->gpr[3] & 0xffffffff,
-					    regs->gpr[4] & 0xffffffff,
-					    regs->gpr[5] & 0xffffffff,
-					    regs->gpr[6] & 0xffffffff);
-	}
+		audit_syscall_entry(AUDIT_ARCH_PPC,
+				    regs->gpr[0],
+				    regs->gpr[3] & 0xffffffff,
+				    regs->gpr[4] & 0xffffffff,
+				    regs->gpr[5] & 0xffffffff,
+				    regs->gpr[6] & 0xffffffff);
 
 	return ret ?: regs->gpr[0];
 }
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index f52758600980..9d82ed4bcb27 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -740,12 +740,11 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->gprs[2]);
 
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(is_compat_task() ?
-					AUDIT_ARCH_S390 : AUDIT_ARCH_S390X,
-				    regs->gprs[2], regs->orig_gpr2,
-				    regs->gprs[3], regs->gprs[4],
-				    regs->gprs[5]);
+	audit_syscall_entry(is_compat_task() ?
+				AUDIT_ARCH_S390 : AUDIT_ARCH_S390X,
+			    regs->gprs[2], regs->orig_gpr2,
+			    regs->gprs[3], regs->gprs[4],
+			    regs->gprs[5]);
 	return ret ?: regs->gprs[2];
 }
 
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index c0b5c179d27b..a3e651563763 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -518,10 +518,9 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->regs[0]);
 
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(audit_arch(), regs->regs[3],
-				    regs->regs[4], regs->regs[5],
-				    regs->regs[6], regs->regs[7]);
+	audit_syscall_entry(audit_arch(), regs->regs[3],
+			    regs->regs[4], regs->regs[5],
+			    regs->regs[6], regs->regs[7]);
 
 	return ret ?: regs->regs[0];
 }
diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c
index ba720d686435..3d0080b5c976 100644
--- a/arch/sh/kernel/ptrace_64.c
+++ b/arch/sh/kernel/ptrace_64.c
@@ -536,10 +536,9 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->regs[9]);
 
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(audit_arch(), regs->regs[1],
-				    regs->regs[2], regs->regs[3],
-				    regs->regs[4], regs->regs[5]);
+	audit_syscall_entry(audit_arch(), regs->regs[1],
+			    regs->regs[2], regs->regs[3],
+			    regs->regs[4], regs->regs[5]);
 
 	return ret ?: regs->regs[9];
 }
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index c73c8c50f117..9388844cd88c 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -1071,15 +1071,14 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->u_regs[UREG_G1]);
 
-	if (unlikely(current->audit_context) && !ret)
-		audit_syscall_entry((test_thread_flag(TIF_32BIT) ?
-				     AUDIT_ARCH_SPARC :
-				     AUDIT_ARCH_SPARC64),
-				    regs->u_regs[UREG_G1],
-				    regs->u_regs[UREG_I0],
-				    regs->u_regs[UREG_I1],
-				    regs->u_regs[UREG_I2],
-				    regs->u_regs[UREG_I3]);
+	audit_syscall_entry((test_thread_flag(TIF_32BIT) ?
+			     AUDIT_ARCH_SPARC :
+			     AUDIT_ARCH_SPARC64),
+			    regs->u_regs[UREG_G1],
+			    regs->u_regs[UREG_I0],
+			    regs->u_regs[UREG_I1],
+			    regs->u_regs[UREG_I2],
+			    regs->u_regs[UREG_I3]);
 
 	return ret;
 }
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index 2ccf25c42feb..06b190390505 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -167,17 +167,15 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit)
 	int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit;
 	int tracesysgood;
 
-	if (unlikely(current->audit_context)) {
-		if (!entryexit)
-			audit_syscall_entry(HOST_AUDIT_ARCH,
-					    UPT_SYSCALL_NR(regs),
-					    UPT_SYSCALL_ARG1(regs),
-					    UPT_SYSCALL_ARG2(regs),
-					    UPT_SYSCALL_ARG3(regs),
-					    UPT_SYSCALL_ARG4(regs));
-		else
-			audit_syscall_exit(regs);
-	}
+	if (!entryexit)
+		audit_syscall_entry(HOST_AUDIT_ARCH,
+				    UPT_SYSCALL_NR(regs),
+				    UPT_SYSCALL_ARG1(regs),
+				    UPT_SYSCALL_ARG2(regs),
+				    UPT_SYSCALL_ARG3(regs),
+				    UPT_SYSCALL_ARG4(regs));
+	else
+		audit_syscall_exit(regs);
 
 	/* Fake a debug trap */
 	if (is_singlestep)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 025f0f01d254..cecfd9a8f734 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -192,7 +192,7 @@ sysexit_from_sys_call:
 	movl %ebx,%edx			/* 3rd arg: 1st syscall arg */
 	movl %eax,%esi			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_I386,%edi	/* 1st arg: audit arch */
-	call audit_syscall_entry
+	call __audit_syscall_entry
 	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall number */
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index a22facf06f0e..1ccd742eba1b 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -456,7 +456,7 @@ sysenter_audit:
 	movl %ebx,%ecx			/* 3rd arg: 1st syscall arg */
 	movl %eax,%edx			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_I386,%eax	/* 1st arg: audit arch */
-	call audit_syscall_entry
+	call __audit_syscall_entry
 	pushl_cfi %ebx
 	movl PT_EAX(%esp),%eax		/* reload syscall number */
 	jmp sysenter_do_call
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e51393dd93a3..1ca66b650123 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -549,7 +549,7 @@ badsys:
 #ifdef CONFIG_AUDITSYSCALL
 	/*
 	 * Fast path for syscall audit without full syscall trace.
-	 * We just call audit_syscall_entry() directly, and then
+	 * We just call __audit_syscall_entry() directly, and then
 	 * jump back to the normal fast path.
 	 */
 auditsys:
@@ -559,7 +559,7 @@ auditsys:
 	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
 	movq %rax,%rsi			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
-	call audit_syscall_entry
+	call __audit_syscall_entry
 	LOAD_ARGS 0		/* reload call-clobbered registers */
 	jmp system_call_fastpath
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 8b0218758775..50267386b766 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->orig_ax);
 
-	if (unlikely(current->audit_context)) {
-		if (IS_IA32)
-			audit_syscall_entry(AUDIT_ARCH_I386,
-					    regs->orig_ax,
-					    regs->bx, regs->cx,
-					    regs->dx, regs->si);
+	if (IS_IA32)
+		audit_syscall_entry(AUDIT_ARCH_I386,
+				    regs->orig_ax,
+				    regs->bx, regs->cx,
+				    regs->dx, regs->si);
 #ifdef CONFIG_X86_64
-		else
-			audit_syscall_entry(AUDIT_ARCH_X86_64,
-					    regs->orig_ax,
-					    regs->di, regs->si,
-					    regs->dx, regs->r10);
+	else
+		audit_syscall_entry(AUDIT_ARCH_X86_64,
+				    regs->orig_ax,
+				    regs->di, regs->si,
+				    regs->dx, regs->r10);
 #endif
-	}
 
 	return ret ?: regs->orig_ax;
 }
diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c
index a0d042aa2967..2dff698ab02e 100644
--- a/arch/xtensa/kernel/ptrace.c
+++ b/arch/xtensa/kernel/ptrace.c
@@ -334,8 +334,7 @@ void do_syscall_trace_enter(struct pt_regs *regs)
 		do_syscall_trace();
 
 #if 0
-	if (unlikely(current->audit_context))
-		audit_syscall_entry(current, AUDIT_ARCH_XTENSA..);
+	audit_syscall_entry(current, AUDIT_ARCH_XTENSA..);
 #endif
 }
 
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 3d65e4b3ba06..f56ce2669b83 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -418,9 +418,9 @@ extern int audit_classify_arch(int arch);
 extern void audit_finish_fork(struct task_struct *child);
 extern int  audit_alloc(struct task_struct *task);
 extern void audit_free(struct task_struct *task);
-extern void audit_syscall_entry(int arch,
-				int major, unsigned long a0, unsigned long a1,
-				unsigned long a2, unsigned long a3);
+extern void __audit_syscall_entry(int arch,
+				  int major, unsigned long a0, unsigned long a1,
+				  unsigned long a2, unsigned long a3);
 extern void __audit_syscall_exit(int ret_success, long ret_value);
 extern void __audit_getname(const char *name);
 extern void audit_putname(const char *name);
@@ -435,6 +435,13 @@ static inline int audit_dummy_context(void)
 	void *p = current->audit_context;
 	return !p || *(int *)p;
 }
+static inline void audit_syscall_entry(int arch, int major, unsigned long a0,
+				       unsigned long a1, unsigned long a2,
+				       unsigned long a3)
+{
+	if (unlikely(!audit_dummy_context()))
+		__audit_syscall_entry(arch, major, a0, a1, a2, a3);
+}
 static inline void audit_syscall_exit(void *pt_regs)
 {
 	if (unlikely(current->audit_context)) {
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3d2853808185..b408100dd6ef 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1632,7 +1632,7 @@ void audit_free(struct task_struct *tsk)
  * will only be written if another part of the kernel requests that it
  * be written).
  */
-void audit_syscall_entry(int arch, int major,
+void __audit_syscall_entry(int arch, int major,
 			 unsigned long a1, unsigned long a2,
 			 unsigned long a3, unsigned long a4)
 {
-- 
cgit v1.2.3-55-g7522


From 6015ff103133c7e50a753c198c69bcabc3a5e3b0 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 18 Jan 2012 01:51:22 +0000
Subject: x86-32: Fix build failure with AUDIT=y, AUDITSYSCALL=n

JONGMAN HEO reports:

  With current linus git (commit a25a2b84), I got following build error,

  arch/x86/kernel/vm86_32.c: In function 'do_sys_vm86':
  arch/x86/kernel/vm86_32.c:340: error: implicit declaration of function '__audit_syscall_exit'
  make[3]: *** [arch/x86/kernel/vm86_32.o] Error 1

OK, I can reproduce it (32bit allmodconfig with AUDIT=y, AUDITSYSCALL=n)

It's due to commit d7e7528bcd45: "Audit: push audit success and retcode
into arch ptrace.h".

Reported-by: JONGMAN HEO <jongman.heo@samsung.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/vm86_32.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index af17e1c966dc..b466cab5ba15 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -336,8 +336,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 		mark_screen_rdonly(tsk->mm);
 
 	/*call __audit_syscall_exit since we do not exit via the normal paths */
+#ifdef CONFIG_AUDITSYSCALL
 	if (unlikely(current->audit_context))
 		__audit_syscall_exit(1, 0);
+#endif
 
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
-- 
cgit v1.2.3-55-g7522


From 90a4c0f51e8e44111a926be6f4c87af3938a79c3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 18 Jan 2012 19:26:11 -0800
Subject: uml: fix compile for x86-64

Randy Dunlap reports that we get

  arch/x86/um/shared/sysdep/ptrace.h:7:20: error: redefinition of 'regs_return_value'
  arch/x86/um/shared/sysdep/ptrace.h:7:20: note: previous definition of 'regs_return_value' was here

when compiling UML for x86-64.

Stephen Rothwell root-caused it and says:

 "Caused by commit d7e7528bcd45 ("Audit: push audit success and retcode
  into arch ptrace.h") (another patch that was never in linux-next :-().

  This file now needs protection against double inclusion."

so let's do as the man says.

Reported-by: Randy Dunlap <rdunlap@xenotime.net>
Analyzed-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/um/shared/sysdep/ptrace.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 5ef9344a8b24..2bbe1ec2d96a 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -1,3 +1,6 @@
+#ifndef __SYSDEP_X86_PTRACE_H
+#define __SYSDEP_X86_PTRACE_H
+
 #ifdef __i386__
 #include "ptrace_32.h"
 #else
@@ -8,3 +11,5 @@ static inline long regs_return_value(struct uml_pt_regs *regs)
 {
 	return UPT_SYSCALL_RET(regs);
 }
+
+#endif /* __SYSDEP_X86_PTRACE_H */
-- 
cgit v1.2.3-55-g7522