From 6b2ddf33baec23dace85bd647e3fc4ac070963e8 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Sun, 17 Jun 2018 00:30:43 +0200 Subject: s390/extmem: fix gcc 8 stringop-overflow warning arch/s390/mm/extmem.c: In function '__segment_load': arch/s390/mm/extmem.c:436:2: warning: 'strncat' specified bound 7 equals source length [-Wstringop-overflow=] strncat(seg->res_name, " (DCSS)", 7); What gcc complains about here is the misuse of the strncat function, which in this case does not limit the number of bytes taken from "src", so in the end it is the same as strcat(seg->res_name, " (DCSS)"); Keeping in mind that res_name is only 15 bytes, strncat in this case would overflow the buffer and write a 0 into the alignment byte between the fields in the struct. To avoid that, increase the res_name size to 16 and use strlcat instead. Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/mm/extmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 6ad15d3fab81..84111a43ea29 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -80,7 +80,7 @@ struct qin64 { struct dcss_segment { struct list_head list; char dcss_name[8]; - char res_name[15]; + char res_name[16]; unsigned long start_addr; unsigned long end; atomic_t ref_count; @@ -433,7 +433,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long memcpy(&seg->res_name, seg->dcss_name, 8); EBCASC(seg->res_name, 8); seg->res_name[8] = '\0'; - strncat(seg->res_name, " (DCSS)", 7); + strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name)); seg->res->name = seg->res_name; rc = seg->vm_segtype; if (rc == SEG_TYPE_SC || -- cgit v1.2.3-55-g7522 From 5bedf8aa03c28cb8dc98bdd32a41b66d8f7d3eaa Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Sun, 24 Jun 2018 12:17:43 +0200 Subject: s390/mm: correct allocate_pgste proc_handler callback Since proc_dointvec does not perform value range control, proc_dointvec_minmax should be used to limit the value range, which is clearly intended here, given the internal representation of the value: unsigned int alloc_pgste:1; In fact it currently works, since we have mm->context.alloc_pgste = page_table_allocate_pgste || ... ... since commit 23fefe119ceb5 ("s390/kvm: avoid global config of vm.alloc_pgste=1") Before that it was mm->context.alloc_pgste = page_table_allocate_pgste; which was broken. That was introduced with commit 0b46e0a3ec0d7 ("s390/kvm: remove delayed reallocation of page tables for KVM").
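
For reference, the bounded-handler pattern looks like this in a minimal, self-contained form (a sketch only; the "demo" names and the sysctl path are hypothetical, not part of the patch):

#include <linux/module.h>
#include <linux/sysctl.h>

static int demo_flag;
static int demo_min;		/* 0 */
static int demo_max = 1;

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_flag",
		.data		= &demo_flag,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		/* Rejects writes outside [*extra1, *extra2]. */
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &demo_min,
		.extra2		= &demo_max,
	},
	{ }
};

static struct ctl_table_header *demo_header;

static int __init demo_init(void)
{
	demo_header = register_sysctl("vm", demo_table);
	return demo_header ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	unregister_sysctl_table(demo_header);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

With plain proc_dointvec the same table would silently accept any integer, which is exactly the mismatch the patch below corrects.
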
Fixes: 0b46e0a3ec0d7 ("s390/kvm: remove delayed reallocation of page tables for KVM") Acked-by: Christian Borntraeger Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/mm/pgalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 84bd6329a88d..6cf9c9ff2bff 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -28,7 +28,7 @@ static struct ctl_table page_table_sysctl[] = { .data = &page_table_allocate_pgste, .maxlen = sizeof(int), .mode = S_IRUGO | S_IWUSR, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &page_table_allocate_pgste_min, .extra2 = &page_table_allocate_pgste_max, }, -- cgit v1.2.3-55-g7522 From 71e33a1dd712dc6f68de733218d4506bea26bda6 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Sun, 1 Jul 2018 15:56:28 +0200 Subject: s390/cmm: split and simplify cmm pages proc handler Split cmm_pages_handler into cmm_pages_handler and cmm_timed_pages_handler, each handling a separate proc entry, and reuse proc_doulongvec_minmax to simplify the proc handlers. Min/max values are optional and are omitted here. Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/mm/cmm.c | 67 ++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 35 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 6cf024eb2085..50d8d1c887a4 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -251,45 +251,42 @@ static int cmm_skip_blanks(char *cp, char **endp) return str != cp; } -static struct ctl_table cmm_table[]; - static int cmm_pages_handler(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - char buf[16], *p; - unsigned int len; - long nr; + long nr = cmm_get_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; - if (!*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; - if (write) { - len = *lenp; - if (copy_from_user(buf, buffer, - len > sizeof(buf) ?
sizeof(buf) : len)) - return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - cmm_skip_blanks(buf, &p); - nr = simple_strtoul(p, &p, 0); - if (ctl == &cmm_table[0]) - cmm_set_pages(nr); - else - cmm_add_timed_pages(nr); - } else { - if (ctl == &cmm_table[0]) - nr = cmm_get_pages(); - else - nr = cmm_get_timed_pages(); - len = sprintf(buf, "%ld\n", nr); - if (len > *lenp) - len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; - } - *lenp = len; - *ppos += len; + cmm_set_pages(nr); + return 0; +} + +static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + long nr = cmm_get_timed_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; + + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + + cmm_add_timed_pages(nr); return 0; } @@ -338,7 +335,7 @@ static struct ctl_table cmm_table[] = { { .procname = "cmm_timed_pages", .mode = 0644, - .proc_handler = cmm_pages_handler, + .proc_handler = cmm_timed_pages_handler, }, { .procname = "cmm_timeout", -- cgit v1.2.3-55-g7522 From ac4c4fc87d6beb6dd8287f24670e5e81c24b1de7 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Fri, 13 Jul 2018 08:13:13 +0200 Subject: s390/cmm: avoid add_timer on concurrently used timer cmm_set_timer can be called concurrently from cmm_thread, the cmm proc handler, upon cmm smsg receipt, and from the timer function itself. To avoid a potential race condition and hitting the BUG_ON in add_timer on an already pending timer, simply reuse mod_timer, which according to the documentation is "the only safe way to modify the timeout" with multiple unserialized concurrent users. mod_timer can handle both active and inactive timers, which also allows a minor code simplification. Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/mm/cmm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 50d8d1c887a4..510a18299196 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -191,12 +191,7 @@ static void cmm_set_timer(void) del_timer(&cmm_timer); return; } - if (timer_pending(&cmm_timer)) { - if (mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds*HZ)) - return; - } - cmm_timer.expires = jiffies + cmm_timeout_seconds*HZ; - add_timer(&cmm_timer); + mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ); } static void cmm_timer_fn(struct timer_list *unused) -- cgit v1.2.3-55-g7522 From 306d6c49ac9ded11114cb53b0925da52f2c2ada1 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Mon, 16 Jul 2018 10:38:57 +0200 Subject: s390/kvm: fix deadlock when killed by oom When the oom killer kills a userspace process in the page fault handler while in guest context, the fault handler fails to release the mm_sem if the FAULT_FLAG_RETRY_NOWAIT option is set. This leads to a deadlock when tearing down the mm when the process terminates. This bug can only happen when pfault is enabled, so only KVM clients are affected. The problem arises in the rare cases in which handle_mm_fault does not release the mm_sem. This patch fixes the issue by manually releasing the mm_sem when needed.
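
Stepping back to the cmm timer rework above: the pattern it adopts is easiest to see in a minimal module sketch (hypothetical "demo" names, assuming the >= 4.15 timer API used by this kernel):

#include <linux/module.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list demo_timer;

static void demo_timer_fn(struct timer_list *unused)
{
	pr_info("demo timer fired\n");
}

/* May be called concurrently from several contexts, like cmm_set_timer. */
static void demo_set_timer(unsigned long seconds)
{
	if (!seconds) {
		del_timer(&demo_timer);
		return;
	}
	/* Safe whether the timer is pending or inactive. */
	mod_timer(&demo_timer, jiffies + seconds * HZ);
}

static int __init demo_init(void)
{
	timer_setup(&demo_timer, demo_timer_fn, 0);
	demo_set_timer(5);
	return 0;
}

static void __exit demo_exit(void)
{
	del_timer_sync(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
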
Fixes: 24eb3a824c4f3 ("KVM: s390: Add FAULT_FLAG_RETRY_NOWAIT for guest fault") Cc: # 3.15+ Signed-off-by: Claudio Imbrenda Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e074480d3598..4cc3f06b0ab3 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -502,6 +502,8 @@ retry: /* No reason to continue if interrupted by SIGKILL. */ if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { fault = VM_FAULT_SIGNAL; + if (flags & FAULT_FLAG_RETRY_NOWAIT) + goto out_up; goto out; } if (unlikely(fault & VM_FAULT_ERROR)) -- cgit v1.2.3-55-g7522 From 5a045bb9c44caebcf4e88bf78343166596c0014b Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:16 +0100 Subject: s390/mm: Make gmap_protect_range more modular This patch reworks the gmap_protect_range logic and extracts the pte handling into its own function. We now also walk to the pmd and make it accessible in the function for later use. This way we can add huge page handling logic more easily. Signed-off-by: Janosch Frank Reviewed-by: Christian Borntraeger Reviewed-by: Martin Schwidefsky --- arch/s390/mm/gmap.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 10 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index bc56ec8abcf7..71b0e9ca0137 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -864,7 +864,79 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, */ static void gmap_pte_op_end(spinlock_t *ptl) { - spin_unlock(ptl); + if (ptl) + spin_unlock(ptl); +} + +/** + * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock + * and return the pmd pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * + * Returns a pointer to the pmd for a guest address, or NULL + */ +static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) +{ + pmd_t *pmdp; + + BUG_ON(gmap_is_shadow(gmap)); + spin_lock(&gmap->guest_table_lock); + pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); + + if (!pmdp || pmd_none(*pmdp)) { + spin_unlock(&gmap->guest_table_lock); + return NULL; + } + + /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ + if (!pmd_large(*pmdp)) + spin_unlock(&gmap->guest_table_lock); + return pmdp; +} + +/** + * gmap_pmd_op_end - release the guest_table_lock if needed + * @gmap: pointer to the guest mapping meta data structure + * @pmdp: pointer to the pmd + */ +static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) +{ + if (pmd_large(*pmdp)) + spin_unlock(&gmap->guest_table_lock); +} + +/* + * gmap_protect_pte - remove access rights to memory and set pgste bits + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @pmdp: pointer to the pmd associated with the pte + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set + * + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EAGAIN if a fixup is needed.
+ * + * Expected to be called with sg->mm->mmap_sem in read + */ +static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int rc; + pte_t *ptep; + spinlock_t *ptl = NULL; + + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return -EAGAIN; + + ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); + if (!ptep) + return -ENOMEM; + + /* Protect and unlock. */ + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits); + gmap_pte_op_end(ptl); + return rc; } /* @@ -884,17 +956,21 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, unsigned long len, int prot, unsigned long bits) { unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; + pmd_t *pmdp; int rc; BUG_ON(gmap_is_shadow(gmap)); while (len) { rc = -EAGAIN; - ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); - if (ptep) { - rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits); - gmap_pte_op_end(ptl); + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (pmdp) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, + bits); + if (!rc) { + len -= PAGE_SIZE; + gaddr += PAGE_SIZE; + } + gmap_pmd_op_end(gmap, pmdp); } if (rc) { vmaddr = __gmap_translate(gmap, gaddr); @@ -903,10 +979,7 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); if (rc) return rc; - continue; } - gaddr += PAGE_SIZE; - len -= PAGE_SIZE; } return 0; } -- cgit v1.2.3-55-g7522 From 2c46e974dd8b5316e65637af0ff6d4bc78554b2e Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:18 +0100 Subject: s390/mm: Abstract gmap notify bit setting Currently we use the software PGSTE bits PGSTE_IN_BIT and PGSTE_VSIE_BIT to notify before an invalidation occurs on a prefix page or a VSIE page respectively. Both bits are pgste specific, but are used when protecting a memory range. Let's introduce abstract GMAP_NOTIFY_* bits that will be realized into the respective bits when gmap DAT table entries are protected. Signed-off-by: Janosch Frank Reviewed-by: Christian Borntraeger Reviewed-by: David Hildenbrand --- arch/s390/include/asm/gmap.h | 4 ++++ arch/s390/mm/gmap.c | 11 +++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index e07cce88dfb0..c1bc5633fc6e 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -9,6 +9,10 @@ #ifndef _ASM_S390_GMAP_H #define _ASM_S390_GMAP_H +/* Generic bits for GMAP notification on DAT table entry changes. */ +#define GMAP_NOTIFY_SHADOW 0x2 +#define GMAP_NOTIFY_MPROT 0x1 + /** * struct gmap_struct - guest address space * @list: list head for the mm->context gmap list diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 71b0e9ca0137..0bada5e097cb 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -912,7 +912,7 @@ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) * @gaddr: virtual address in the guest address space * @pmdp: pointer to the pmd associated with the pte * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: pgste notification bits to set + * @bits: notification bits to set * * Returns 0 if successfully protected, -ENOMEM if out of memory and * -EAGAIN if a fixup is needed. 
@@ -925,6 +925,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, int rc; pte_t *ptep; spinlock_t *ptl = NULL; + unsigned long pbits = 0; if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) return -EAGAIN; @@ -933,8 +934,10 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, if (!ptep) return -ENOMEM; + pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; + pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; /* Protect and unlock. */ - rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits); + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); gmap_pte_op_end(ptl); return rc; } @@ -1008,7 +1011,7 @@ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, if (!MACHINE_HAS_ESOP && prot == PROT_READ) return -EINVAL; down_read(&gmap->mm->mmap_sem); - rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT); + rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); up_read(&gmap->mm->mmap_sem); return rc; } @@ -1599,7 +1602,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, down_read(&parent->mm->mmap_sem); rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, - PROT_READ, PGSTE_VSIE_BIT); + PROT_READ, GMAP_NOTIFY_SHADOW); up_read(&parent->mm->mmap_sem); spin_lock(&parent->shadow_lock); new->initialized = true; -- cgit v1.2.3-55-g7522 From 58b7e200d2f1f9af9ef69a401a877791837a1c87 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:20 +0100 Subject: s390/mm: Add gmap pmd linking Let's allow pmds to be linked into the gmap for the upcoming s390 KVM huge page support. Before this patch we copied the full userspace pmd entry. This is not correct, as it contains SW defined bits that might be interpreted differently in the GMAP context. Now we only copy over the hardware-relevant information, leaving out the software bits.
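
To see the effect of that masking, here is a small standalone sketch (the two constants are copied from the patch below; the sample pmd value is hypothetical):

#include <stdio.h>

#define _SEGMENT_ENTRY_HARDWARE_BITS		0xfffffffffffffe30UL
#define _SEGMENT_ENTRY_HARDWARE_BITS_LARGE	0xfffffffffff00730UL

int main(void)
{
	/* Hypothetical host pmd of a large segment with software bits set. */
	unsigned long pmd = 0x0000000080000ef1UL;

	printf("host pmd:         %016lx\n", pmd);
	/* Only hardware-defined bits end up in the gmap entry. */
	printf("gmap large entry: %016lx\n",
	       pmd & _SEGMENT_ENTRY_HARDWARE_BITS_LARGE);
	return 0;
}
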
Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand --- arch/s390/include/asm/pgtable.h | 6 ++++-- arch/s390/mm/gmap.c | 13 +++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 5ab636089c60..fe36b3bb2afd 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -268,8 +268,10 @@ static inline int is_module_addr(void *addr) #define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe2fUL /* Bits in the segment table entry */ -#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL -#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL +#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL +#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL +#define _SEGMENT_ENTRY_HARDWARE_BITS 0xfffffffffffffe30UL +#define _SEGMENT_ENTRY_HARDWARE_BITS_LARGE 0xfffffffffff00730UL #define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address */ #define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* page table origin */ #define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 0bada5e097cb..870e81fcb0cf 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -596,10 +596,15 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) if (*table == _SEGMENT_ENTRY_EMPTY) { rc = radix_tree_insert(&gmap->host_to_guest, vmaddr >> PMD_SHIFT, table); - if (!rc) - *table = pmd_val(*pmd); - } else - rc = 0; + if (!rc) { + if (pmd_large(*pmd)) { + *table = pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS_LARGE; + } else + *table = pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS; + } + } spin_unlock(&gmap->guest_table_lock); spin_unlock(ptl); radix_tree_preload_end(); -- cgit v1.2.3-55-g7522 From 7c4b13a7c042fd6b71dc48d291208f8a97fad333 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:21 +0100 Subject: s390/mm: Add gmap pmd notification bit setting Like for ptes, we also need invalidation notification for pmds, to make sure the guest lowcore pages are always accessible and to prepare for the later addition of shadowed pmds. With PMDs we do not have PGSTEs or some other bits we could use in the host PMD. Instead we pick one of the free bits in the gmap PMD. Every time a host pmd is invalidated, we check whether the respective gmap PMD has the bit set and, if so, fire up the notifier.
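
A tiny userspace model of that check-and-notify scheme (the bit value mirrors the patch below; the notifier callback is invented for the demo):

#include <stdio.h>

#define _SEGMENT_ENTRY_GMAP_IN 0x8000UL	/* invalidation notify bit */

static void notifier(unsigned long gaddr)
{
	printf("segment at %#lx is being invalidated\n", gaddr);
}

/* Fire the notifier once and clear the bit, as described above. */
static void pmd_notify(unsigned long *entry, unsigned long gaddr)
{
	if (*entry & _SEGMENT_ENTRY_GMAP_IN) {
		*entry &= ~_SEGMENT_ENTRY_GMAP_IN;
		notifier(gaddr);
	}
}

int main(void)
{
	unsigned long gmap_pmd = 0x80000400UL | _SEGMENT_ENTRY_GMAP_IN;

	pmd_notify(&gmap_pmd, 0x100000UL);	/* fires */
	pmd_notify(&gmap_pmd, 0x100000UL);	/* bit already cleared, silent */
	return 0;
}
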
Signed-off-by: Janosch Frank --- arch/s390/include/asm/gmap.h | 3 +++ arch/s390/mm/gmap.c | 60 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 57 insertions(+), 6 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index c1bc5633fc6e..276268b48aff 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -13,6 +13,9 @@ #define GMAP_NOTIFY_SHADOW 0x2 #define GMAP_NOTIFY_MPROT 0x1 +/* Status bits only for huge segment entries */ +#define _SEGMENT_ENTRY_GMAP_IN 0x8000 /* invalidation notify bit */ + /** * struct gmap_struct - guest address space * @list: list head for the mm->context gmap list diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 870e81fcb0cf..96dd94d51ad4 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -911,6 +911,40 @@ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) spin_unlock(&gmap->guest_table_lock); } +/* + * gmap_protect_pmd - remove access rights to memory and set pmd notification bits + * @pmdp: pointer to the pmd to be protected + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns: + * 0 if successfully protected + * -EAGAIN if a fixup is needed + * -EINVAL if unsupported notifier bits have been specified + * + * Expected to be called with sg->mm->mmap_sem in read and + * guest_table_lock held. + */ +static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; + int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; + + /* Fixup needed */ + if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) + return -EAGAIN; + + if (bits & GMAP_NOTIFY_MPROT) + pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN; + + /* Shadow GMAP protection needs split PMDs */ + if (bits & GMAP_NOTIFY_SHADOW) + return -EINVAL; + + return 0; +} + /* * gmap_protect_pte - remove access rights to memory and set pgste bits * @gmap: pointer to guest mapping meta data structure @@ -963,7 +997,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, unsigned long len, int prot, unsigned long bits) { - unsigned long vmaddr; + unsigned long vmaddr, dist; pmd_t *pmdp; int rc; @@ -972,15 +1006,29 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, rc = -EAGAIN; pmdp = gmap_pmd_op_walk(gmap, gaddr); if (pmdp) { - rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, - bits); - if (!rc) { - len -= PAGE_SIZE; - gaddr += PAGE_SIZE; + if (!pmd_large(*pmdp)) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, + bits); + if (!rc) { + len -= PAGE_SIZE; + gaddr += PAGE_SIZE; + } + } else { + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, + bits); + if (!rc) { + dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK); + len = len < dist ? 
0 : len - dist; + gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE; + } } gmap_pmd_op_end(gmap, pmdp); } if (rc) { + if (rc == -EINVAL) + return rc; + + /* -EAGAIN, fixup of userspace mm and gmap */ vmaddr = __gmap_translate(gmap, gaddr); if (IS_ERR_VALUE(vmaddr)) return vmaddr; -- cgit v1.2.3-55-g7522 From 6a3762778d1ba1a58ab473124790cd612d10eadc Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:22 +0100 Subject: s390/mm: Add gmap pmd invalidation and clearing If the host invalidates a pmd, we also have to invalidate the corresponding gmap pmds, as well as flush them from the TLB. This is necessary, as we don't share the pmd tables between host and guest as we do with ptes. The clearing part of these three new functions sets a guest pmd entry to _SEGMENT_ENTRY_EMPTY, so the guest will fault on it and we will re-link it. Flushing the gmap is not necessary in the host's lazy local and csp cases. Both purge the TLB completely. Signed-off-by: Janosch Frank Reviewed-by: Martin Schwidefsky Acked-by: David Hildenbrand --- arch/s390/include/asm/pgtable.h | 4 ++ arch/s390/mm/gmap.c | 125 ++++++++++++++++++++++++++++++++++++++++ arch/s390/mm/pgtable.c | 17 +++++- 3 files changed, 143 insertions(+), 3 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index fe36b3bb2afd..087e0282f165 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1118,6 +1118,10 @@ int set_pgste_bits(struct mm_struct *mm, unsigned long addr, int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep); int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, unsigned long *oldpte, unsigned long *oldpgste); +void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr); +void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr); +void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr); +void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr); /* * Certain architectures need to do special things when PTEs diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 96dd94d51ad4..87c174ee3a8c 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2221,6 +2221,131 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, } EXPORT_SYMBOL_GPL(ptep_notify); +static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN; + gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); +} + +static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, + int purge) +{ + pmd_t *pmdp; + struct gmap *gmap; + unsigned long gaddr; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (pmdp) { + gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~_SEGMENT_ENTRY_HARDWARE_BITS_LARGE); + if (purge) + __pmdp_csp(pmdp); + pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} + +/** + * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without + * flushing + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) +{ + gmap_pmdp_clear(mm, vmaddr, 0); +} 
+EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); + +/** + * gmap_pmdp_csp - csp all affected guest pmd entries + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr) +{ + gmap_pmdp_clear(mm, vmaddr, 1); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_csp); + +/** + * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long *entry, gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (entry) { + pmdp = (pmd_t *)entry; + gaddr = __gmap_segment_gaddr(entry); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(*entry & ~_SEGMENT_ENTRY_HARDWARE_BITS_LARGE); + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_LOCAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); + *entry = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); + +/** + * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long *entry, gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (entry) { + pmdp = (pmd_t *)entry; + gaddr = __gmap_segment_gaddr(entry); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(*entry & ~_SEGMENT_ENTRY_HARDWARE_BITS_LARGE); + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); + *entry = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); + static inline void thp_split_mm(struct mm_struct *mm) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 301e466e4263..fe84c0715395 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -347,18 +347,27 @@ static inline void pmdp_idte_local(struct mm_struct *mm, mm->context.asce, IDTE_LOCAL); else __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); + if (mm_has_pgste(mm)) + gmap_pmdp_idte_local(mm, addr); } static inline void pmdp_idte_global(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (MACHINE_HAS_TLB_GUEST) + if (MACHINE_HAS_TLB_GUEST) { __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) + if (mm_has_pgste(mm)) + gmap_pmdp_idte_global(mm, addr); + } else if (MACHINE_HAS_IDTE) { __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); - else + if (mm_has_pgste(mm)) + gmap_pmdp_idte_global(mm, addr); + } else { __pmdp_csp(pmdp); + if (mm_has_pgste(mm)) + gmap_pmdp_csp(mm, addr); + } } static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, @@ -392,6 +401,8 @@ static inline pmd_t pmdp_flush_lazy(struct 
mm_struct *mm, cpumask_of(smp_processor_id()))) { pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; mm->context.flush_mm = 1; + if (mm_has_pgste(mm)) + gmap_pmdp_invalidate(mm, addr); } else { pmdp_idte_global(mm, addr, pmdp); } -- cgit v1.2.3-55-g7522 From 0959e168678d2d95648317e1e5e46bcb358272eb Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 17 Jul 2018 13:21:22 +0100 Subject: s390/mm: Add huge page dirty sync support To do dirty logging with huge pages, we protect huge pmds in the gmap. When they are written to, we unprotect them and mark them dirty. We introduce the function gmap_test_and_clear_dirty_pmd which handles dirty sync for huge pages. Signed-off-by: Janosch Frank Acked-by: David Hildenbrand --- arch/s390/include/asm/gmap.h | 3 + arch/s390/include/asm/pgtable.h | 3 +- arch/s390/kvm/kvm-s390.c | 25 +++++--- arch/s390/mm/gmap.c | 127 ++++++++++++++++++++++++++++++++++++++-- arch/s390/mm/pgtable.c | 34 +---------- 5 files changed, 148 insertions(+), 44 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 276268b48aff..fcbd638fb9f4 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -15,6 +15,7 @@ /* Status bits only for huge segment entries */ #define _SEGMENT_ENTRY_GMAP_IN 0x8000 /* invalidation notify bit */ +#define _SEGMENT_ENTRY_GMAP_UC 0x4000 /* dirty (migration) */ /** * struct gmap_struct - guest address space @@ -139,4 +140,6 @@ void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *, int gmap_mprotect_notify(struct gmap *, unsigned long start, unsigned long len, int prot); +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], + unsigned long gaddr, unsigned long vmaddr); #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 087e0282f165..0e7cb0dc9c33 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1103,7 +1103,8 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *sptep, pte_t *tptep, pte_t pte); void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep); -bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); +bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long address, + pte_t *ptep); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, bool nq); int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3b7a5151b6a5..4cff5e31ca36 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -511,19 +511,30 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) } static void kvm_s390_sync_dirty_log(struct kvm *kvm, - struct kvm_memory_slot *memslot) + struct kvm_memory_slot *memslot) { + int i; gfn_t cur_gfn, last_gfn; - unsigned long address; + unsigned long gaddr, vmaddr; struct gmap *gmap = kvm->arch.gmap; + DECLARE_BITMAP(bitmap, _PAGE_ENTRIES); - /* Loop over all guest pages */ + /* Loop over all guest segments */ + cur_gfn = memslot->base_gfn; last_gfn = memslot->base_gfn + memslot->npages; - for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) { - address = gfn_to_hva_memslot(memslot, cur_gfn); + for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) { + gaddr = gfn_to_gpa(cur_gfn); + vmaddr = gfn_to_hva_memslot(memslot, cur_gfn); + if (kvm_is_error_hva(vmaddr)) + continue; + + bitmap_zero(bitmap,
_PAGE_ENTRIES); + gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr); + for (i = 0; i < _PAGE_ENTRIES; i++) { + if (test_bit(i, bitmap)) + mark_page_dirty(kvm, cur_gfn + i); + } - if (test_and_clear_guest_dirty(gmap->mm, address)) - mark_page_dirty(kvm, cur_gfn); if (fatal_signal_pending(current)) return; cond_resched(); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 87c174ee3a8c..3807de25a706 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -521,6 +521,9 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table, rcu_read_unlock(); } +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, + unsigned long gaddr); + /** * gmap_link - set up shadow page tables to connect a host to a guest address * @gmap: pointer to guest mapping meta data structure @@ -541,6 +544,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) p4d_t *p4d; pud_t *pud; pmd_t *pmd; + u64 unprot; int rc; BUG_ON(gmap_is_shadow(gmap)); @@ -598,12 +602,19 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) vmaddr >> PMD_SHIFT, table); if (!rc) { if (pmd_large(*pmd)) { - *table = pmd_val(*pmd) & - _SEGMENT_ENTRY_HARDWARE_BITS_LARGE; + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) + | _SEGMENT_ENTRY_GMAP_UC; } else *table = pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS; } + } else if (*table & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { + unprot = (u64)*table; + unprot &= ~_SEGMENT_ENTRY_PROTECT; + unprot |= _SEGMENT_ENTRY_GMAP_UC; + gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); } spin_unlock(&gmap->guest_table_lock); spin_unlock(ptl); @@ -930,11 +941,23 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, { int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; + pmd_t new = *pmdp; /* Fixup needed */ if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) return -EAGAIN; + if (prot == PROT_NONE && !pmd_i) { + pmd_val(new) |= _SEGMENT_ENTRY_INVALID; + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (prot == PROT_READ && !pmd_p) { + pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID; + pmd_val(new) |= _SEGMENT_ENTRY_PROTECT; + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + if (bits & GMAP_NOTIFY_MPROT) pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN; @@ -2228,6 +2251,32 @@ static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); } +/** + * gmap_pmdp_xchg - exchange a gmap pmd with another + * @gmap: pointer to the guest address space structure + * @pmdp: pointer to the pmd entry + * @new: replacement entry + * @gaddr: the affected guest address + * + * This function is assumed to be called with the guest_table_lock + * held. 
+ */ +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, + unsigned long gaddr) +{ + gaddr &= HPAGE_MASK; + pmdp_notify_gmap(gmap, pmdp, gaddr); + pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN; + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, + IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); + *pmdp = new; +} + static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, int purge) { @@ -2243,7 +2292,8 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, if (pmdp) { gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(pmd_val(*pmdp) & ~_SEGMENT_ENTRY_HARDWARE_BITS_LARGE); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); if (purge) __pmdp_csp(pmdp); pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; @@ -2296,7 +2346,8 @@ void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) pmdp = (pmd_t *)entry; gaddr = __gmap_segment_gaddr(entry); pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~_SEGMENT_ENTRY_HARDWARE_BITS_LARGE); + WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_LOCAL); @@ -2330,7 +2381,8 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) pmdp = (pmd_t *)entry; gaddr = __gmap_segment_gaddr(entry); pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~_SEGMENT_ENTRY_HARDWARE_BITS_LARGE); + WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); @@ -2346,6 +2398,71 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) } EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); +/** + * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status + * @gmap: pointer to guest address space + * @pmdp: pointer to the pmd to be tested + * @gaddr: virtual address in the guest address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return false; + + /* Already protected memory, which did not change is clean */ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) + return false; + + /* Clear UC indication and reset protection */ + pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC; + gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); + return true; +} + +/** + * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment + * @gmap: pointer to guest address space + * @bitmap: dirty bitmap for this pmd + * @gaddr: virtual address in the guest address space + * @vmaddr: virtual address in the host address space + * + * This function is assumed to be called with the guest_table_lock + * held. 
+ */ +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], + unsigned long gaddr, unsigned long vmaddr) +{ + int i; + pmd_t *pmdp; + pte_t *ptep; + spinlock_t *ptl; + + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return; + + if (pmd_large(*pmdp)) { + if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) + bitmap_fill(bitmap, _PAGE_ENTRIES); + } else { + for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { + ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); + if (!ptep) + continue; + if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) + set_bit(i, bitmap); + spin_unlock(ptl); + } + } + gmap_pmd_op_end(gmap, pmdp); +} +EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); + static inline void thp_split_mm(struct mm_struct *mm) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index fe84c0715395..684df964e345 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -704,40 +704,14 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) /* * Test and reset if a guest page is dirty */ -bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) +bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { - spinlock_t *ptl; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; pgste_t pgste; - pte_t *ptep; pte_t pte; bool dirty; int nodat; - pgd = pgd_offset(mm, addr); - p4d = p4d_alloc(mm, pgd, addr); - if (!p4d) - return false; - pud = pud_alloc(mm, p4d, addr); - if (!pud) - return false; - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - return false; - /* We can't run guests backed by huge pages, but userspace can - * still set them up and then try to migrate them without any - * migration support. - */ - if (pmd_large(*pmd)) - return true; - - ptep = pte_alloc_map_lock(mm, pmd, addr, &ptl); - if (unlikely(!ptep)) - return false; - pgste = pgste_get_lock(ptep); dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); pgste_val(pgste) &= ~PGSTE_UC_BIT; @@ -753,11 +727,9 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) *ptep = pte; } pgste_set_unlock(ptep, pgste); - - spin_unlock(ptl); return dirty; } -EXPORT_SYMBOL_GPL(test_and_clear_guest_dirty); +EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, bool nq) -- cgit v1.2.3-55-g7522 From 964c2c05c9f3095a18387a57b289cf06de637521 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Fri, 13 Jul 2018 11:28:25 +0100 Subject: s390/mm: Clear huge page storage keys on enable_skey When a guest starts using storage keys, we trap and set a default one for its whole valid address space. With this patch we are now able to do that for large pages. To speed up the storage key insertion, we use __storage_key_init_range, which in turn will use sske_frame to set multiple storage keys with one instruction. As it was previously used only for debugging, we have to get rid of the default key check and make the operation quiescing.
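
The stepping logic of __storage_key_init_range, modelled in plain userspace C (key setting replaced by printf; sizes mirror s390, where EDAT1 lets one sske_frame instruction key a whole 1MB frame):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define FRAME_SIZE	(1UL << 20)	/* 1MB frame (EDAT1) */

static void key_init_range(unsigned long start, unsigned long end)
{
	unsigned long boundary;

	while (start < end) {
		/* Round up to the next 1MB boundary. */
		boundary = (start + FRAME_SIZE) & ~(FRAME_SIZE - 1);
		if (boundary <= end) {
			/* sske_frame would key everything up to boundary. */
			printf("sske_frame %#lx..%#lx\n", start, boundary - 1);
			start = boundary;
			continue;
		}
		printf("sske page  %#lx\n", start);
		start += PAGE_SIZE;
	}
}

int main(void)
{
	key_init_range(0x0ff000UL, 0x301000UL);	/* hypothetical range */
	return 0;
}
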
Signed-off-by: Dominik Dingel Signed-off-by: Janosch Frank [replaced page_set_storage_key loop with __storage_key_init_range] Reviewed-by: Martin Schwidefsky Reviewed-by: David Hildenbrand --- arch/s390/mm/gmap.c | 32 +++++++++++++++++++++++++++++--- arch/s390/mm/pageattr.c | 6 ++---- 2 files changed, 31 insertions(+), 7 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 3807de25a706..a8e32e60226b 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2539,17 +2539,43 @@ EXPORT_SYMBOL_GPL(s390_enable_sie); * Enable storage key handling from now on and initialize the storage * keys with the default key. */ -static int __s390_enable_skey(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) +static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) { /* Clear storage key */ ptep_zap_key(walk->mm, addr, pte); return 0; } +static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, + unsigned long hmask, unsigned long next, + struct mm_walk *walk) +{ + pmd_t *pmd = (pmd_t *)pte; + unsigned long start, end; + + /* + * The write check makes sure we do not set a key on shared + * memory. This is needed as the walker does not differentiate + * between actual guest memory and the process executable or + * shared libraries. + */ + if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || + !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) + return 0; + + start = pmd_val(*pmd) & HPAGE_MASK; + end = start + HPAGE_SIZE - 1; + __storage_key_init_range(start, end); + return 0; +} + int s390_enable_skey(void) { - struct mm_walk walk = { .pte_entry = __s390_enable_skey }; + struct mm_walk walk = { + .hugetlb_entry = __s390_enable_skey_hugetlb, + .pte_entry = __s390_enable_skey_pte, + }; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc = 0; diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index c44171588d08..f8c6faab41f4 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -14,7 +14,7 @@ static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) { - asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],9,0" + asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0" : [addr] "+a" (addr) : [skey] "d" (skey)); return addr; } @@ -23,8 +23,6 @@ void __storage_key_init_range(unsigned long start, unsigned long end) { unsigned long boundary, size; - if (!PAGE_DEFAULT_KEY) - return; while (start < end) { if (MACHINE_HAS_EDAT1) { /* set storage keys for a 1MB frame */ @@ -37,7 +35,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end) continue; } } - page_set_storage_key(start, PAGE_DEFAULT_KEY, 0); + page_set_storage_key(start, PAGE_DEFAULT_KEY, 1); start += PAGE_SIZE; } } -- cgit v1.2.3-55-g7522 From 3afdfca69870963ae01e280732a5ee493a2fcbb3 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:26 +0100 Subject: s390/mm: Clear skeys for newly mapped huge guest pmds Similarly to the pte skey handling, where we set the storage key to the default key for each newly mapped pte, we also have to do that for huge pmds. With the PG_arch_1 flag we keep track of whether the area has already been cleared of its skeys.
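
The once-only guard used here, reduced to its core in a userspace model (the kernel's atomic test_and_set_bit is simulated non-atomically for the demo):

#include <stdbool.h>
#include <stdio.h>

#define PG_ARCH_1 7	/* arbitrary bit index for the demo */

struct page { unsigned long flags; };

/* Non-atomic stand-in for the kernel's test_and_set_bit(). */
static bool test_and_set_bit(int nr, unsigned long *flags)
{
	bool old = *flags & (1UL << nr);

	*flags |= 1UL << nr;
	return old;
}

static void clear_skeys_once(struct page *page)
{
	if (!test_and_set_bit(PG_ARCH_1, &page->flags))
		printf("initializing storage keys for this huge page\n");
	else
		printf("already initialized, skipping\n");
}

int main(void)
{
	struct page huge_page = { 0 };

	clear_skeys_once(&huge_page);	/* first mapping: initialize */
	clear_skeys_once(&huge_page);	/* remap: skip */
	return 0;
}
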
Signed-off-by: Janosch Frank Reviewed-by: Martin Schwidefsky --- arch/s390/include/asm/hugetlb.h | 5 ++++- arch/s390/mm/gmap.c | 2 ++ arch/s390/mm/hugetlbpage.c | 24 ++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 9c5fc50204dd..2d1afa58a4b6 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -37,7 +37,10 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -#define arch_clear_hugepage_flags(page) do { } while (0) +static inline void arch_clear_hugepage_flags(struct page *page) +{ + clear_bit(PG_arch_1, &page->flags); +} static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index a8e32e60226b..a6738c0c4499 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2553,6 +2553,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, { pmd_t *pmd = (pmd_t *)pte; unsigned long start, end; + struct page *page = pmd_page(*pmd); /* * The write check makes sure we do not set a key on shared @@ -2567,6 +2568,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, start = pmd_val(*pmd) & HPAGE_MASK; end = start + HPAGE_SIZE - 1; __storage_key_init_range(start, end); + set_bit(PG_arch_1, &page->flags); return 0; } diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index e804090f4470..b0246c705a19 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -123,6 +123,29 @@ static inline pte_t __rste_to_pte(unsigned long rste) return pte; } +static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) +{ + struct page *page; + unsigned long size, paddr; + + if (!mm_uses_skeys(mm) || + rste & _SEGMENT_ENTRY_INVALID) + return; + + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + page = pud_page(__pud(rste)); + size = PUD_SIZE; + paddr = rste & PUD_MASK; + } else { + page = pmd_page(__pmd(rste)); + size = PMD_SIZE; + paddr = rste & PMD_MASK; + } + + if (!test_and_set_bit(PG_arch_1, &page->flags)) + __storage_key_init_range(paddr, paddr + size - 1); +} + void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -137,6 +160,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE; else rste |= _SEGMENT_ENTRY_LARGE; + clear_huge_pte_skeys(mm, rste); pte_val(*ptep) = rste; } -- cgit v1.2.3-55-g7522 From 637ff9efe5eab419ea7f2bd6f2cf50f3cb69e322 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:28 +0100 Subject: s390/mm: Add huge pmd storage key handling Storage keys for guests with huge page mappings have to be managed in hardware. There are no PGSTEs for PMDs that we could use to retain the guest's logical view of the key.
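
The address arithmetic that the new huge-pmd key paths rely on, shown standalone (hypothetical input values; on s390 HPAGE_SHIFT is 20):

#include <stdio.h>

#define HPAGE_SHIFT	20
#define HPAGE_SIZE	(1UL << HPAGE_SHIFT)
#define HPAGE_MASK	(~(HPAGE_SIZE - 1))

int main(void)
{
	unsigned long pmd_val = 0x12340000fe30UL;	/* hypothetical pmd */
	unsigned long addr = 0x7f0000053000UL;		/* hypothetical vaddr */
	unsigned long paddr;

	/* Segment origin from the pmd, 4K page offset from the address. */
	paddr = (pmd_val & HPAGE_MASK) | (addr & ~HPAGE_MASK);
	printf("storage key is accessed at %#lx\n", paddr);
	return 0;
}
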
Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand --- arch/s390/mm/pgtable.c | 108 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 98 insertions(+), 10 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 684df964e345..37d68706f5aa 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -410,6 +410,24 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, return old; } +static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, addr); + if (!pud) + return NULL; + pmd = pmd_alloc(mm, pud, addr); + return pmd; +} + pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t new) { @@ -734,12 +752,36 @@ EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, bool nq) { - unsigned long keyul; + unsigned long keyul, paddr; spinlock_t *ptl; pgste_t old, new; + pmd_t *pmdp; pte_t *ptep; - ptep = get_locked_pte(mm, addr, &ptl); + pmdp = pmd_alloc_map(mm, addr); + if (unlikely(!pmdp)) + return -EFAULT; + + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return -EFAULT; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + /* + * Huge pmds need quiescing operations, they are + * always mapped. + */ + page_set_storage_key(paddr, key, 1); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + + ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -750,14 +792,14 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; if (!(pte_val(*ptep) & _PAGE_INVALID)) { - unsigned long address, bits, skey; + unsigned long bits, skey; - address = pte_val(*ptep) & PAGE_MASK; - skey = (unsigned long) page_get_storage_key(address); + paddr = pte_val(*ptep) & PAGE_MASK; + skey = (unsigned long) page_get_storage_key(paddr); bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); /* Set storage key ACC and FP */ - page_set_storage_key(address, skey, !nq); + page_set_storage_key(paddr, skey, !nq); /* Merge host changed & referenced into pgste */ pgste_val(new) |= bits << 52; } @@ -813,11 +855,32 @@ EXPORT_SYMBOL(cond_set_guest_storage_key); int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) { spinlock_t *ptl; + unsigned long paddr; pgste_t old, new; + pmd_t *pmdp; pte_t *ptep; int cc = 0; - ptep = get_locked_pte(mm, addr, &ptl); + pmdp = pmd_alloc_map(mm, addr); + if (unlikely(!pmdp)) + return -EFAULT; + + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return -EFAULT; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + cc = page_reset_referenced(paddr); + spin_unlock(ptl); + return cc; + } + spin_unlock(ptl); + + ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -826,7 +889,8 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) pgste_val(new) &= ~PGSTE_GR_BIT; if (!(pte_val(*ptep) & _PAGE_INVALID)) { - cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK); + paddr = 
pte_val(*ptep) & PAGE_MASK; + cc = page_reset_referenced(paddr); /* Merge real referenced bit into host-set */ pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT; } @@ -845,18 +909,42 @@ EXPORT_SYMBOL(reset_guest_reference_bit); int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char *key) { + unsigned long paddr; spinlock_t *ptl; pgste_t pgste; + pmd_t *pmdp; pte_t *ptep; - ptep = get_locked_pte(mm, addr, &ptl); + pmdp = pmd_alloc_map(mm, addr); + if (unlikely(!pmdp)) + return -EFAULT; + + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + /* Not yet mapped memory has a zero key */ + spin_unlock(ptl); + *key = 0; + return 0; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + *key = page_get_storage_key(paddr); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + + ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); if (unlikely(!ptep)) return -EFAULT; pgste = pgste_get_lock(ptep); *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + paddr = pte_val(*ptep) & PAGE_MASK; if (!(pte_val(*ptep) & _PAGE_INVALID)) - *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK); + *key = page_get_storage_key(paddr); /* Reflect guest's logical view, not physical */ *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; pgste_set_unlock(ptep, pgste); -- cgit v1.2.3-55-g7522 From 7d735b9ae82d073e23ff7d2648e0aa8056049d16 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Fri, 13 Jul 2018 11:28:29 +0100 Subject: s390/mm: hugetlb pages within a gmap can not be freed Guests backed by huge pages could theoretically free unused pages via the diagnose 10 instruction. We currently don't allow that, so we don't have to refault the pages once they are needed again. Signed-off-by: Dominik Dingel Reviewed-by: Martin Schwidefsky Reviewed-by: David Hildenbrand Signed-off-by: Janosch Frank --- arch/s390/mm/gmap.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index a6738c0c4499..736ed32a83c5 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -706,6 +706,12 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) vmaddr |= gaddr & ~PMD_MASK; /* Find vma in the parent mm */ vma = find_vma(gmap->mm, vmaddr); + /* + * We do not discard pages that are backed by + * hugetlbfs, so we don't have to refault them. + */ + if (vma && is_vm_hugetlb_page(vma)) + continue; size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); zap_page_range(vma, vmaddr, size); } -- cgit v1.2.3-55-g7522 From a9e00d8349c98e0973c8b0d671d69e838f7b5bcc Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:37 +0100 Subject: s390/mm: Add huge page gmap linking support Let's allow huge pmd linking when enabled through the KVM_CAP_S390_HPAGE_1M capability. We can now also restrict gmap invalidation and notification to the cases where the capability has been activated, saving some cycles when that's not the case.
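
From the userspace side, a VMM would opt in roughly like this (a sketch with error handling elided; it assumes kernel headers that define KVM_CAP_S390_HPAGE_1M):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Enable 1MB huge-page backing for a freshly created VM fd. */
static int enable_hpage_1m(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_S390_HPAGE_1M;
	/* Should be done before guest memory is set up. */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
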
Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand --- arch/s390/include/asm/mmu.h | 2 ++ arch/s390/include/asm/mmu_context.h | 1 + arch/s390/mm/gmap.c | 9 ++++++--- arch/s390/mm/pgtable.c | 8 ++++---- 4 files changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index f5ff9dbad8ac..f31a15044c24 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -24,6 +24,8 @@ typedef struct { unsigned int uses_skeys:1; /* The mmu context uses CMM. */ unsigned int uses_cmm:1; + /* The gmaps associated with this context are allowed to use huge pages. */ + unsigned int allow_gmap_hpage_1m:1; } mm_context_t; #define INIT_MM_CONTEXT(name) \ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index d16bc79c30bb..0717ee76885d 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -32,6 +32,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.has_pgste = 0; mm->context.uses_skeys = 0; mm->context.uses_cmm = 0; + mm->context.allow_gmap_hpage_1m = 0; #endif switch (mm->context.asce_limit) { case _REGION2_SIZE: diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 736ed32a83c5..bb44990c8212 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2,8 +2,10 @@ /* * KVM guest address space mapping code * - * Copyright IBM Corp. 2007, 2016 + * Copyright IBM Corp. 2007, 2016, 2018 * Author(s): Martin Schwidefsky + * David Hildenbrand + * Janosch Frank */ #include @@ -588,8 +590,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) return -EFAULT; pmd = pmd_offset(pud, vmaddr); VM_BUG_ON(pmd_none(*pmd)); - /* large pmds cannot yet be handled */ - if (pmd_large(*pmd)) + /* Are we allowed to use huge pages? */ + if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) return -EFAULT; /* Link gmap segment table entry location to page table. 
*/ rc = radix_tree_preload(GFP_KERNEL); @@ -1632,6 +1634,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, unsigned long limit; int rc; + BUG_ON(parent->mm->context.allow_gmap_hpage_1m); BUG_ON(gmap_is_shadow(parent)); spin_lock(&parent->shadow_lock); sg = gmap_find_shadow(parent, asce, edat_level); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 37d68706f5aa..f2cc7da473e4 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -347,7 +347,7 @@ static inline void pmdp_idte_local(struct mm_struct *mm, mm->context.asce, IDTE_LOCAL); else __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); - if (mm_has_pgste(mm)) + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_local(mm, addr); } @@ -357,15 +357,15 @@ static inline void pmdp_idte_global(struct mm_struct *mm, if (MACHINE_HAS_TLB_GUEST) { __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); - if (mm_has_pgste(mm)) + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_global(mm, addr); } else if (MACHINE_HAS_IDTE) { __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); - if (mm_has_pgste(mm)) + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_global(mm, addr); } else { __pmdp_csp(pmdp); - if (mm_has_pgste(mm)) + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_csp(mm, addr); } } -- cgit v1.2.3-55-g7522 From 37a366face294facb9c9d9fdd9f5b64a27456cbd Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Tue, 7 Aug 2018 18:57:11 +0200 Subject: s390/mm: fix addressing exception after suspend/resume Commit c9b5ad546e7d "s390/mm: tag normal pages vs pages used in page tables" accidentally changed the logic in arch_set_page_states(), which is used by the suspend/resume code. set_page_stable(page, order) was changed to set_page_stable_dat(page, 0). After this, only the first page of higher order pages will be set to stable, and a write to one of the unstable pages will result in an addressing exception. Fix this by using "order" again, instead of "0". Fixes: c9b5ad546e7d ("s390/mm: tag normal pages vs pages used in page tables") Cc: stable@vger.kernel.org # 4.14+ Reviewed-by: Heiko Carstens Signed-off-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/mm/page-states.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 382153ff17e3..dc3cede7f2ec 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -271,7 +271,7 @@ void arch_set_page_states(int make_stable) list_for_each(l, &zone->free_area[order].free_list[t]) { page = list_entry(l, struct page, lru); if (make_stable) - set_page_stable_dat(page, 0); + set_page_stable_dat(page, order); else set_page_unused(page, order); } -- cgit v1.2.3-55-g7522