From 6b2ddf33baec23dace85bd647e3fc4ac070963e8 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Sun, 17 Jun 2018 00:30:43 +0200 Subject: s390/extmem: fix gcc 8 stringop-overflow warning arch/s390/mm/extmem.c: In function '__segment_load': arch/s390/mm/extmem.c:436:2: warning: 'strncat' specified bound 7 equals source length [-Wstringop-overflow=] strncat(seg->res_name, " (DCSS)", 7); What gcc complains about here is the misuse of the strncat function, which in this case does not limit the number of bytes taken from "src", so in the end it is the same as strcat(seg->res_name, " (DCSS)"); Keeping in mind that res_name is only 15 bytes, strncat in this case would overflow the buffer and write a 0 into the alignment byte between the fields in the struct. To avoid that, increase the res_name size to 16 and use strlcat instead. Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/mm/extmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 6ad15d3fab81..84111a43ea29 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -80,7 +80,7 @@ struct qin64 { struct dcss_segment { struct list_head list; char dcss_name[8]; - char res_name[15]; + char res_name[16]; unsigned long start_addr; unsigned long end; atomic_t ref_count; @@ -433,7 +433,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long memcpy(&seg->res_name, seg->dcss_name, 8); EBCASC(seg->res_name, 8); seg->res_name[8] = '\0'; - strncat(seg->res_name, " (DCSS)", 7); + strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name)); seg->res->name = seg->res_name; rc = seg->vm_segtype; if (rc == SEG_TYPE_SC || -- cgit v1.2.3-55-g7522 From 5bedf8aa03c28cb8dc98bdd32a41b66d8f7d3eaa Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Sun, 24 Jun 2018 12:17:43 +0200 Subject: s390/mm: correct allocate_pgste proc_handler callback Since proc_dointvec does not perform value range control, proc_dointvec_minmax should be used to limit the value range, which is clearly intended here, given the internal representation of the value: unsigned int alloc_pgste:1; In fact it currently works, since we have mm->context.alloc_pgste = page_table_allocate_pgste || ... ... since commit 23fefe119ceb5 ("s390/kvm: avoid global config of vm.alloc_pgste=1") Before that it was mm->context.alloc_pgste = page_table_allocate_pgste; which was broken. That was introduced with commit 0b46e0a3ec0d7 ("s390/kvm: remove delayed reallocation of page tables for KVM").
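
For reference, the bounded-handler pattern looks like this in a minimal, self-contained form (a sketch only; the "demo" names and the sysctl path are hypothetical, not part of the patch):

#include <linux/module.h>
#include <linux/sysctl.h>

static int demo_flag;
static int demo_min;		/* 0 */
static int demo_max = 1;

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_flag",
		.data		= &demo_flag,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		/* Rejects writes outside [*extra1, *extra2]. */
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &demo_min,
		.extra2		= &demo_max,
	},
	{ }
};

static struct ctl_table_header *demo_header;

static int __init demo_init(void)
{
	demo_header = register_sysctl("vm", demo_table);
	return demo_header ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
	unregister_sysctl_table(demo_header);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

With plain proc_dointvec the same table would silently accept any integer, which is exactly the mismatch the patch below corrects.
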
Fixes: 0b46e0a3ec0d7 ("s390/kvm: remove delayed reallocation of page tables for KVM") Acked-by: Christian Borntraeger Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/mm/pgalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 84bd6329a88d..6cf9c9ff2bff 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -28,7 +28,7 @@ static struct ctl_table page_table_sysctl[] = { .data = &page_table_allocate_pgste, .maxlen = sizeof(int), .mode = S_IRUGO | S_IWUSR, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &page_table_allocate_pgste_min, .extra2 = &page_table_allocate_pgste_max, }, -- cgit v1.2.3-55-g7522 From 71e33a1dd712dc6f68de733218d4506bea26bda6 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Sun, 1 Jul 2018 15:56:28 +0200 Subject: s390/cmm: split and simplify cmm pages proc handler Split cmm_pages_handler into cmm_pages_handler and cmm_timed_pages_handler, each handling a separate proc entry, and reuse proc_doulongvec_minmax to simplify the proc handlers. Min/max values are optional and are omitted here. Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/mm/cmm.c | 67 ++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 35 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 6cf024eb2085..50d8d1c887a4 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -251,45 +251,42 @@ static int cmm_skip_blanks(char *cp, char **endp) return str != cp; } -static struct ctl_table cmm_table[]; - static int cmm_pages_handler(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - char buf[16], *p; - unsigned int len; - long nr; + long nr = cmm_get_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; - if (!*lenp || (*ppos && !write)) { - *lenp = 0; - return 0; - } + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; - if (write) { - len = *lenp; - if (copy_from_user(buf, buffer, - len > sizeof(buf) ?
sizeof(buf) : len)) - return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - cmm_skip_blanks(buf, &p); - nr = simple_strtoul(p, &p, 0); - if (ctl == &cmm_table[0]) - cmm_set_pages(nr); - else - cmm_add_timed_pages(nr); - } else { - if (ctl == &cmm_table[0]) - nr = cmm_get_pages(); - else - nr = cmm_get_timed_pages(); - len = sprintf(buf, "%ld\n", nr); - if (len > *lenp) - len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; - } - *lenp = len; - *ppos += len; + cmm_set_pages(nr); + return 0; +} + +static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + long nr = cmm_get_timed_pages(); + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &nr, + .maxlen = sizeof(long), + }; + int rc; + + rc = proc_doulongvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + + cmm_add_timed_pages(nr); return 0; } @@ -338,7 +335,7 @@ static struct ctl_table cmm_table[] = { { .procname = "cmm_timed_pages", .mode = 0644, - .proc_handler = cmm_pages_handler, + .proc_handler = cmm_timed_pages_handler, }, { .procname = "cmm_timeout", -- cgit v1.2.3-55-g7522 From ac4c4fc87d6beb6dd8287f24670e5e81c24b1de7 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Fri, 13 Jul 2018 08:13:13 +0200 Subject: s390/cmm: avoid add_timer on concurrently used timer cmm_set_timer can be called concurrently from cmm_thread, the cmm proc handler, upon cmm smsg receipt, and from the timer function itself. To avoid a potential race condition and hitting the BUG_ON in add_timer on an already pending timer, simply reuse mod_timer, which according to the documentation is "the only safe way to modify the timeout" with multiple unserialized concurrent users. mod_timer can handle both active and inactive timers, which also allows a minor code simplification. Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/mm/cmm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 50d8d1c887a4..510a18299196 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -191,12 +191,7 @@ static void cmm_set_timer(void) del_timer(&cmm_timer); return; } - if (timer_pending(&cmm_timer)) { - if (mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds*HZ)) - return; - } - cmm_timer.expires = jiffies + cmm_timeout_seconds*HZ; - add_timer(&cmm_timer); + mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ); } static void cmm_timer_fn(struct timer_list *unused) -- cgit v1.2.3-55-g7522 From 306d6c49ac9ded11114cb53b0925da52f2c2ada1 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Mon, 16 Jul 2018 10:38:57 +0200 Subject: s390/kvm: fix deadlock when killed by oom When the oom killer kills a userspace process in the page fault handler while in guest context, the fault handler fails to release the mm_sem if the FAULT_FLAG_RETRY_NOWAIT option is set. This leads to a deadlock when tearing down the mm when the process terminates. This bug can only happen when pfault is enabled, so only KVM clients are affected. The problem arises in the rare cases in which handle_mm_fault does not release the mm_sem. This patch fixes the issue by manually releasing the mm_sem when needed.
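
Stepping back to the cmm timer rework above: the pattern it adopts is easiest to see in a minimal module sketch (hypothetical "demo" names, assuming the >= 4.15 timer API used by this kernel):

#include <linux/module.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list demo_timer;

static void demo_timer_fn(struct timer_list *unused)
{
	pr_info("demo timer fired\n");
}

/* May be called concurrently from several contexts, like cmm_set_timer. */
static void demo_set_timer(unsigned long seconds)
{
	if (!seconds) {
		del_timer(&demo_timer);
		return;
	}
	/* Safe whether the timer is pending or inactive. */
	mod_timer(&demo_timer, jiffies + seconds * HZ);
}

static int __init demo_init(void)
{
	timer_setup(&demo_timer, demo_timer_fn, 0);
	demo_set_timer(5);
	return 0;
}

static void __exit demo_exit(void)
{
	del_timer_sync(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
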
Fixes: 24eb3a824c4f3 ("KVM: s390: Add FAULT_FLAG_RETRY_NOWAIT for guest fault") Cc: # 3.15+ Signed-off-by: Claudio Imbrenda Signed-off-by: Martin Schwidefsky --- arch/s390/mm/fault.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e074480d3598..4cc3f06b0ab3 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -502,6 +502,8 @@ retry: /* No reason to continue if interrupted by SIGKILL. */ if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { fault = VM_FAULT_SIGNAL; + if (flags & FAULT_FLAG_RETRY_NOWAIT) + goto out_up; goto out; } if (unlikely(fault & VM_FAULT_ERROR)) -- cgit v1.2.3-55-g7522 From 5a045bb9c44caebcf4e88bf78343166596c0014b Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:16 +0100 Subject: s390/mm: Make gmap_protect_range more modular This patch reworks the gmap_protect_range logic and extracts the pte handling into its own function. We now also walk to the pmd and make it accessible in the function for later use. This way we can add huge page handling logic more easily. Signed-off-by: Janosch Frank Reviewed-by: Christian Borntraeger Reviewed-by: Martin Schwidefsky --- arch/s390/mm/gmap.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 10 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index bc56ec8abcf7..71b0e9ca0137 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -864,7 +864,79 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, */ static void gmap_pte_op_end(spinlock_t *ptl) { - spin_unlock(ptl); + if (ptl) + spin_unlock(ptl); +} + +/** + * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock + * and return the pmd pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * + * Returns a pointer to the pmd for a guest address, or NULL + */ +static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) +{ + pmd_t *pmdp; + + BUG_ON(gmap_is_shadow(gmap)); + spin_lock(&gmap->guest_table_lock); + pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); + + if (!pmdp || pmd_none(*pmdp)) { + spin_unlock(&gmap->guest_table_lock); + return NULL; + } + + /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ + if (!pmd_large(*pmdp)) + spin_unlock(&gmap->guest_table_lock); + return pmdp; +} + +/** + * gmap_pmd_op_end - release the guest_table_lock if needed + * @gmap: pointer to the guest mapping meta data structure + * @pmdp: pointer to the pmd + */ +static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) +{ + if (pmd_large(*pmdp)) + spin_unlock(&gmap->guest_table_lock); +} + +/* + * gmap_protect_pte - remove access rights to memory and set pgste bits + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @pmdp: pointer to the pmd associated with the pte + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set + * + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EAGAIN if a fixup is needed.
+ * + * Expected to be called with sg->mm->mmap_sem in read + */ +static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int rc; + pte_t *ptep; + spinlock_t *ptl = NULL; + + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return -EAGAIN; + + ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); + if (!ptep) + return -ENOMEM; + + /* Protect and unlock. */ + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits); + gmap_pte_op_end(ptl); + return rc; } /* @@ -884,17 +956,21 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, unsigned long len, int prot, unsigned long bits) { unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; + pmd_t *pmdp; int rc; BUG_ON(gmap_is_shadow(gmap)); while (len) { rc = -EAGAIN; - ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); - if (ptep) { - rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits); - gmap_pte_op_end(ptl); + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (pmdp) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, + bits); + if (!rc) { + len -= PAGE_SIZE; + gaddr += PAGE_SIZE; + } + gmap_pmd_op_end(gmap, pmdp); } if (rc) { vmaddr = __gmap_translate(gmap, gaddr); @@ -903,10 +979,7 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); if (rc) return rc; - continue; } - gaddr += PAGE_SIZE; - len -= PAGE_SIZE; } return 0; } -- cgit v1.2.3-55-g7522 From 2c46e974dd8b5316e65637af0ff6d4bc78554b2e Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:18 +0100 Subject: s390/mm: Abstract gmap notify bit setting Currently we use the software PGSTE bits PGSTE_IN_BIT and PGSTE_VSIE_BIT to notify before an invalidation occurs on a prefix page or a VSIE page respectively. Both bits are pgste specific, but are used when protecting a memory range. Let's introduce abstract GMAP_NOTIFY_* bits that will be realized into the respective bits when gmap DAT table entries are protected. Signed-off-by: Janosch Frank Reviewed-by: Christian Borntraeger Reviewed-by: David Hildenbrand --- arch/s390/include/asm/gmap.h | 4 ++++ arch/s390/mm/gmap.c | 11 +++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index e07cce88dfb0..c1bc5633fc6e 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -9,6 +9,10 @@ #ifndef _ASM_S390_GMAP_H #define _ASM_S390_GMAP_H +/* Generic bits for GMAP notification on DAT table entry changes. */ +#define GMAP_NOTIFY_SHADOW 0x2 +#define GMAP_NOTIFY_MPROT 0x1 + /** * struct gmap_struct - guest address space * @list: list head for the mm->context gmap list diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 71b0e9ca0137..0bada5e097cb 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -912,7 +912,7 @@ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) * @gaddr: virtual address in the guest address space * @pmdp: pointer to the pmd associated with the pte * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: pgste notification bits to set + * @bits: notification bits to set * * Returns 0 if successfully protected, -ENOMEM if out of memory and * -EAGAIN if a fixup is needed. 
@@ -925,6 +925,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, int rc; pte_t *ptep; spinlock_t *ptl = NULL; + unsigned long pbits = 0; if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) return -EAGAIN; @@ -933,8 +934,10 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, if (!ptep) return -ENOMEM; + pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; + pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; /* Protect and unlock. */ - rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits); + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); gmap_pte_op_end(ptl); return rc; } @@ -1008,7 +1011,7 @@ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, if (!MACHINE_HAS_ESOP && prot == PROT_READ) return -EINVAL; down_read(&gmap->mm->mmap_sem); - rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT); + rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); up_read(&gmap->mm->mmap_sem); return rc; } @@ -1599,7 +1602,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, down_read(&parent->mm->mmap_sem); rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, - PROT_READ, PGSTE_VSIE_BIT); + PROT_READ, GMAP_NOTIFY_SHADOW); up_read(&parent->mm->mmap_sem); spin_lock(&parent->shadow_lock); new->initialized = true; -- cgit v1.2.3-55-g7522 From 58b7e200d2f1f9af9ef69a401a877791837a1c87 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:20 +0100 Subject: s390/mm: Add gmap pmd linking Let's allow pmds to be linked into the gmap for the upcoming s390 KVM huge page support. Before this patch we copied the full userspace pmd entry. This is not correct, as it contains SW defined bits that might be interpreted differently in the GMAP context. Now we only copy over the hardware-relevant information, leaving out the software bits.
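
To see the effect of that masking, here is a small standalone sketch (the two constants are copied from the patch below; the sample pmd value is hypothetical):

#include <stdio.h>

#define _SEGMENT_ENTRY_HARDWARE_BITS		0xfffffffffffffe30UL
#define _SEGMENT_ENTRY_HARDWARE_BITS_LARGE	0xfffffffffff00730UL

int main(void)
{
	/* Hypothetical host pmd of a large segment with software bits set. */
	unsigned long pmd = 0x0000000080000ef1UL;

	printf("host pmd:         %016lx\n", pmd);
	/* Only hardware-defined bits end up in the gmap entry. */
	printf("gmap large entry: %016lx\n",
	       pmd & _SEGMENT_ENTRY_HARDWARE_BITS_LARGE);
	return 0;
}
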
Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand --- arch/s390/include/asm/pgtable.h | 6 ++++-- arch/s390/mm/gmap.c | 13 +++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 5ab636089c60..fe36b3bb2afd 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -268,8 +268,10 @@ static inline int is_module_addr(void *addr) #define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe2fUL /* Bits in the segment table entry */ -#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL -#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL +#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL +#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL +#define _SEGMENT_ENTRY_HARDWARE_BITS 0xfffffffffffffe30UL +#define _SEGMENT_ENTRY_HARDWARE_BITS_LARGE 0xfffffffffff00730UL #define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address */ #define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* page table origin */ #define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 0bada5e097cb..870e81fcb0cf 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -596,10 +596,15 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) if (*table == _SEGMENT_ENTRY_EMPTY) { rc = radix_tree_insert(&gmap->host_to_guest, vmaddr >> PMD_SHIFT, table); - if (!rc) - *table = pmd_val(*pmd); - } else - rc = 0; + if (!rc) { + if (pmd_large(*pmd)) { + *table = pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS_LARGE; + } else + *table = pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS; + } + } spin_unlock(&gmap->guest_table_lock); spin_unlock(ptl); radix_tree_preload_end(); -- cgit v1.2.3-55-g7522 From 7c4b13a7c042fd6b71dc48d291208f8a97fad333 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:21 +0100 Subject: s390/mm: Add gmap pmd notification bit setting Like for ptes, we also need invalidation notification for pmds, to make sure the guest lowcore pages are always accessible and to prepare for the later addition of shadowed pmds. With PMDs we do not have PGSTEs or some other bits we could use in the host PMD. Instead we pick one of the free bits in the gmap PMD. Every time a host pmd is invalidated, we check whether the respective gmap PMD has the bit set and, if so, fire up the notifier.
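
A tiny userspace model of that check-and-notify scheme (the bit value mirrors the patch below; the notifier callback is invented for the demo):

#include <stdio.h>

#define _SEGMENT_ENTRY_GMAP_IN 0x8000UL	/* invalidation notify bit */

static void notifier(unsigned long gaddr)
{
	printf("segment at %#lx is being invalidated\n", gaddr);
}

/* Fire the notifier once and clear the bit, as described above. */
static void pmd_notify(unsigned long *entry, unsigned long gaddr)
{
	if (*entry & _SEGMENT_ENTRY_GMAP_IN) {
		*entry &= ~_SEGMENT_ENTRY_GMAP_IN;
		notifier(gaddr);
	}
}

int main(void)
{
	unsigned long gmap_pmd = 0x80000400UL | _SEGMENT_ENTRY_GMAP_IN;

	pmd_notify(&gmap_pmd, 0x100000UL);	/* fires */
	pmd_notify(&gmap_pmd, 0x100000UL);	/* bit already cleared, silent */
	return 0;
}
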
Signed-off-by: Janosch Frank --- arch/s390/include/asm/gmap.h | 3 +++ arch/s390/mm/gmap.c | 60 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 57 insertions(+), 6 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index c1bc5633fc6e..276268b48aff 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -13,6 +13,9 @@ #define GMAP_NOTIFY_SHADOW 0x2 #define GMAP_NOTIFY_MPROT 0x1 +/* Status bits only for huge segment entries */ +#define _SEGMENT_ENTRY_GMAP_IN 0x8000 /* invalidation notify bit */ + /** * struct gmap_struct - guest address space * @list: list head for the mm->context gmap list diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 870e81fcb0cf..96dd94d51ad4 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -911,6 +911,40 @@ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) spin_unlock(&gmap->guest_table_lock); } +/* + * gmap_protect_pmd - remove access rights to memory and set pmd notification bits + * @pmdp: pointer to the pmd to be protected + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: notification bits to set + * + * Returns: + * 0 if successfully protected + * -EAGAIN if a fixup is needed + * -EINVAL if unsupported notifier bits have been specified + * + * Expected to be called with sg->mm->mmap_sem in read and + * guest_table_lock held. + */ +static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, + pmd_t *pmdp, int prot, unsigned long bits) +{ + int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; + int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; + + /* Fixup needed */ + if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) + return -EAGAIN; + + if (bits & GMAP_NOTIFY_MPROT) + pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN; + + /* Shadow GMAP protection needs split PMDs */ + if (bits & GMAP_NOTIFY_SHADOW) + return -EINVAL; + + return 0; +} + /* * gmap_protect_pte - remove access rights to memory and set pgste bits * @gmap: pointer to guest mapping meta data structure @@ -963,7 +997,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, unsigned long len, int prot, unsigned long bits) { - unsigned long vmaddr; + unsigned long vmaddr, dist; pmd_t *pmdp; int rc; @@ -972,15 +1006,29 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, rc = -EAGAIN; pmdp = gmap_pmd_op_walk(gmap, gaddr); if (pmdp) { - rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, - bits); - if (!rc) { - len -= PAGE_SIZE; - gaddr += PAGE_SIZE; + if (!pmd_large(*pmdp)) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, + bits); + if (!rc) { + len -= PAGE_SIZE; + gaddr += PAGE_SIZE; + } + } else { + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, + bits); + if (!rc) { + dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK); + len = len < dist ? 
0 : len - dist; + gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE; + } } gmap_pmd_op_end(gmap, pmdp); } if (rc) { + if (rc == -EINVAL) + return rc; + + /* -EAGAIN, fixup of userspace mm and gmap */ vmaddr = __gmap_translate(gmap, gaddr); if (IS_ERR_VALUE(vmaddr)) return vmaddr; -- cgit v1.2.3-55-g7522 From 6a3762778d1ba1a58ab473124790cd612d10eadc Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:22 +0100 Subject: s390/mm: Add gmap pmd invalidation and clearing If the host invalidates a pmd, we also have to invalidate the corresponding gmap pmds, as well as flush them from the TLB. This is necessary, as we don't share the pmd tables between host and guest as we do with ptes. The clearing part of these three new functions sets a guest pmd entry to _SEGMENT_ENTRY_EMPTY, so the guest will fault on it and we will re-link it. Flushing the gmap is not necessary in the host's lazy local and csp cases. Both purge the TLB completely. Signed-off-by: Janosch Frank Reviewed-by: Martin Schwidefsky Acked-by: David Hildenbrand --- arch/s390/include/asm/pgtable.h | 4 ++ arch/s390/mm/gmap.c | 125 ++++++++++++++++++++++++++++++++++++++++ arch/s390/mm/pgtable.c | 17 +++++- 3 files changed, 143 insertions(+), 3 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index fe36b3bb2afd..087e0282f165 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1118,6 +1118,10 @@ int set_pgste_bits(struct mm_struct *mm, unsigned long addr, int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep); int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, unsigned long *oldpte, unsigned long *oldpgste); +void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr); +void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr); +void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr); +void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr); /* * Certain architectures need to do special things when PTEs diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 96dd94d51ad4..87c174ee3a8c 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2221,6 +2221,131 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, } EXPORT_SYMBOL_GPL(ptep_notify); +static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN; + gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); +} + +static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, + int purge) +{ + pmd_t *pmdp; + struct gmap *gmap; + unsigned long gaddr; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (pmdp) { + gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(pmd_val(*pmdp) & ~_SEGMENT_ENTRY_HARDWARE_BITS_LARGE); + if (purge) + __pmdp_csp(pmdp); + pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} + +/** + * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without + * flushing + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) +{ + gmap_pmdp_clear(mm, vmaddr, 0); +} 
+EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); + +/** + * gmap_pmdp_csp - csp all affected guest pmd entries + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr) +{ + gmap_pmdp_clear(mm, vmaddr, 1); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_csp); + +/** + * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long *entry, gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (entry) { + pmdp = (pmd_t *)entry; + gaddr = __gmap_segment_gaddr(entry); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(*entry & ~_SEGMENT_ENTRY_HARDWARE_BITS_LARGE); + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_LOCAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); + *entry = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); + +/** + * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry + * @mm: pointer to the process mm_struct + * @vmaddr: virtual address in the process address space + */ +void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) +{ + unsigned long *entry, gaddr; + struct gmap *gmap; + pmd_t *pmdp; + + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); + entry = radix_tree_delete(&gmap->host_to_guest, + vmaddr >> PMD_SHIFT); + if (entry) { + pmdp = (pmd_t *)entry; + gaddr = __gmap_segment_gaddr(entry); + pmdp_notify_gmap(gmap, pmdp, gaddr); + WARN_ON(*entry & ~_SEGMENT_ENTRY_HARDWARE_BITS_LARGE); + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, + gmap->asce, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); + *entry = _SEGMENT_ENTRY_EMPTY; + } + spin_unlock(&gmap->guest_table_lock); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); + static inline void thp_split_mm(struct mm_struct *mm) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 301e466e4263..fe84c0715395 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -347,18 +347,27 @@ static inline void pmdp_idte_local(struct mm_struct *mm, mm->context.asce, IDTE_LOCAL); else __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); + if (mm_has_pgste(mm)) + gmap_pmdp_idte_local(mm, addr); } static inline void pmdp_idte_global(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (MACHINE_HAS_TLB_GUEST) + if (MACHINE_HAS_TLB_GUEST) { __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); - else if (MACHINE_HAS_IDTE) + if (mm_has_pgste(mm)) + gmap_pmdp_idte_global(mm, addr); + } else if (MACHINE_HAS_IDTE) { __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); - else + if (mm_has_pgste(mm)) + gmap_pmdp_idte_global(mm, addr); + } else { __pmdp_csp(pmdp); + if (mm_has_pgste(mm)) + gmap_pmdp_csp(mm, addr); + } } static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, @@ -392,6 +401,8 @@ static inline pmd_t pmdp_flush_lazy(struct 
mm_struct *mm, cpumask_of(smp_processor_id()))) { pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; mm->context.flush_mm = 1; + if (mm_has_pgste(mm)) + gmap_pmdp_invalidate(mm, addr); } else { pmdp_idte_global(mm, addr, pmdp); } -- cgit v1.2.3-55-g7522 From 0959e168678d2d95648317e1e5e46bcb358272eb Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 17 Jul 2018 13:21:22 +0100 Subject: s390/mm: Add huge page dirty sync support To do dirty logging with huge pages, we protect huge pmds in the gmap. When they are written to, we unprotect them and mark them dirty. We introduce the function gmap_test_and_clear_dirty_pmd which handles dirty sync for huge pages. Signed-off-by: Janosch Frank Acked-by: David Hildenbrand --- arch/s390/include/asm/gmap.h | 3 + arch/s390/include/asm/pgtable.h | 3 +- arch/s390/kvm/kvm-s390.c | 25 +++++--- arch/s390/mm/gmap.c | 127 ++++++++++++++++++++++++++++++++++++++-- arch/s390/mm/pgtable.c | 34 +---------- 5 files changed, 148 insertions(+), 44 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 276268b48aff..fcbd638fb9f4 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -15,6 +15,7 @@ /* Status bits only for huge segment entries */ #define _SEGMENT_ENTRY_GMAP_IN 0x8000 /* invalidation notify bit */ +#define _SEGMENT_ENTRY_GMAP_UC 0x4000 /* dirty (migration) */ /** * struct gmap_struct - guest address space @@ -139,4 +140,6 @@ void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *, int gmap_mprotect_notify(struct gmap *, unsigned long start, unsigned long len, int prot); +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], + unsigned long gaddr, unsigned long vmaddr); #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 087e0282f165..0e7cb0dc9c33 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1103,7 +1103,8 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *sptep, pte_t *tptep, pte_t pte); void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep); -bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); +bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long address, + pte_t *ptep); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, bool nq); int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3b7a5151b6a5..4cff5e31ca36 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -511,19 +511,30 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) } static void kvm_s390_sync_dirty_log(struct kvm *kvm, - struct kvm_memory_slot *memslot) + struct kvm_memory_slot *memslot) { + int i; gfn_t cur_gfn, last_gfn; - unsigned long address; + unsigned long gaddr, vmaddr; struct gmap *gmap = kvm->arch.gmap; + DECLARE_BITMAP(bitmap, _PAGE_ENTRIES); - /* Loop over all guest pages */ + /* Loop over all guest segments */ + cur_gfn = memslot->base_gfn; last_gfn = memslot->base_gfn + memslot->npages; - for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) { - address = gfn_to_hva_memslot(memslot, cur_gfn); + for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) { + gaddr = gfn_to_gpa(cur_gfn); + vmaddr = gfn_to_hva_memslot(memslot, cur_gfn); + if (kvm_is_error_hva(vmaddr)) + continue; + + bitmap_zero(bitmap,
_PAGE_ENTRIES); + gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr); + for (i = 0; i < _PAGE_ENTRIES; i++) { + if (test_bit(i, bitmap)) + mark_page_dirty(kvm, cur_gfn + i); + } - if (test_and_clear_guest_dirty(gmap->mm, address)) - mark_page_dirty(kvm, cur_gfn); if (fatal_signal_pending(current)) return; cond_resched(); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 87c174ee3a8c..3807de25a706 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -521,6 +521,9 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table, rcu_read_unlock(); } +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, + unsigned long gaddr); + /** * gmap_link - set up shadow page tables to connect a host to a guest address * @gmap: pointer to guest mapping meta data structure @@ -541,6 +544,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) p4d_t *p4d; pud_t *pud; pmd_t *pmd; + u64 unprot; int rc; BUG_ON(gmap_is_shadow(gmap)); @@ -598,12 +602,19 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) vmaddr >> PMD_SHIFT, table); if (!rc) { if (pmd_large(*pmd)) { - *table = pmd_val(*pmd) & - _SEGMENT_ENTRY_HARDWARE_BITS_LARGE; + *table = (pmd_val(*pmd) & + _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) + | _SEGMENT_ENTRY_GMAP_UC; } else *table = pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS; } + } else if (*table & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { + unprot = (u64)*table; + unprot &= ~_SEGMENT_ENTRY_PROTECT; + unprot |= _SEGMENT_ENTRY_GMAP_UC; + gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); } spin_unlock(&gmap->guest_table_lock); spin_unlock(ptl); @@ -930,11 +941,23 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, { int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; + pmd_t new = *pmdp; /* Fixup needed */ if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) return -EAGAIN; + if (prot == PROT_NONE && !pmd_i) { + pmd_val(new) |= _SEGMENT_ENTRY_INVALID; + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + + if (prot == PROT_READ && !pmd_p) { + pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID; + pmd_val(new) |= _SEGMENT_ENTRY_PROTECT; + gmap_pmdp_xchg(gmap, pmdp, new, gaddr); + } + if (bits & GMAP_NOTIFY_MPROT) pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN; @@ -2228,6 +2251,32 @@ static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); } +/** + * gmap_pmdp_xchg - exchange a gmap pmd with another + * @gmap: pointer to the guest address space structure + * @pmdp: pointer to the pmd entry + * @new: replacement entry + * @gaddr: the affected guest address + * + * This function is assumed to be called with the guest_table_lock + * held. 
+ */ +static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, + unsigned long gaddr) +{ + gaddr &= HPAGE_MASK; + pmdp_notify_gmap(gmap, pmdp, gaddr); + pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN; + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, + IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); + *pmdp = new; +} + static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, int purge) { @@ -2243,7 +2292,8 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, if (pmdp) { gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(pmd_val(*pmdp) & ~_SEGMENT_ENTRY_HARDWARE_BITS_LARGE); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); if (purge) __pmdp_csp(pmdp); pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; @@ -2296,7 +2346,8 @@ void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) pmdp = (pmd_t *)entry; gaddr = __gmap_segment_gaddr(entry); pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~_SEGMENT_ENTRY_HARDWARE_BITS_LARGE); + WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_LOCAL); @@ -2330,7 +2381,8 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) pmdp = (pmd_t *)entry; gaddr = __gmap_segment_gaddr(entry); pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~_SEGMENT_ENTRY_HARDWARE_BITS_LARGE); + WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); @@ -2346,6 +2398,71 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) } EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); +/** + * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status + * @gmap: pointer to guest address space + * @pmdp: pointer to the pmd to be tested + * @gaddr: virtual address in the guest address space + * + * This function is assumed to be called with the guest_table_lock + * held. + */ +bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, + unsigned long gaddr) +{ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) + return false; + + /* Already protected memory, which did not change is clean */ + if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && + !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) + return false; + + /* Clear UC indication and reset protection */ + pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC; + gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); + return true; +} + +/** + * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment + * @gmap: pointer to guest address space + * @bitmap: dirty bitmap for this pmd + * @gaddr: virtual address in the guest address space + * @vmaddr: virtual address in the host address space + * + * This function is assumed to be called with the guest_table_lock + * held. 
+ */ +void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], + unsigned long gaddr, unsigned long vmaddr) +{ + int i; + pmd_t *pmdp; + pte_t *ptep; + spinlock_t *ptl; + + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return; + + if (pmd_large(*pmdp)) { + if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) + bitmap_fill(bitmap, _PAGE_ENTRIES); + } else { + for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { + ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); + if (!ptep) + continue; + if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) + set_bit(i, bitmap); + spin_unlock(ptl); + } + } + gmap_pmd_op_end(gmap, pmdp); +} +EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); + static inline void thp_split_mm(struct mm_struct *mm) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index fe84c0715395..684df964e345 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -704,40 +704,14 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) /* * Test and reset if a guest page is dirty */ -bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) +bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { - spinlock_t *ptl; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; pgste_t pgste; - pte_t *ptep; pte_t pte; bool dirty; int nodat; - pgd = pgd_offset(mm, addr); - p4d = p4d_alloc(mm, pgd, addr); - if (!p4d) - return false; - pud = pud_alloc(mm, p4d, addr); - if (!pud) - return false; - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - return false; - /* We can't run guests backed by huge pages, but userspace can - * still set them up and then try to migrate them without any - * migration support. - */ - if (pmd_large(*pmd)) - return true; - - ptep = pte_alloc_map_lock(mm, pmd, addr, &ptl); - if (unlikely(!ptep)) - return false; - pgste = pgste_get_lock(ptep); dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); pgste_val(pgste) &= ~PGSTE_UC_BIT; @@ -753,11 +727,9 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) *ptep = pte; } pgste_set_unlock(ptep, pgste); - - spin_unlock(ptl); return dirty; } -EXPORT_SYMBOL_GPL(test_and_clear_guest_dirty); +EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, bool nq) -- cgit v1.2.3-55-g7522 From 964c2c05c9f3095a18387a57b289cf06de637521 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Fri, 13 Jul 2018 11:28:25 +0100 Subject: s390/mm: Clear huge page storage keys on enable_skey When a guest starts using storage keys, we trap and set a default one for its whole valid address space. With this patch we are now able to do that for large pages. To speed up the storage key insertion, we use __storage_key_init_range, which in turn will use sske_frame to set multiple storage keys with one instruction. As it was previously used only for debugging, we have to get rid of the default key check and make the operation quiescing.
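
The stepping logic of __storage_key_init_range, modelled in plain userspace C (key setting replaced by printf; sizes mirror s390, where EDAT1 lets one sske_frame instruction key a whole 1MB frame):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define FRAME_SIZE	(1UL << 20)	/* 1MB frame (EDAT1) */

static void key_init_range(unsigned long start, unsigned long end)
{
	unsigned long boundary;

	while (start < end) {
		/* Round up to the next 1MB boundary. */
		boundary = (start + FRAME_SIZE) & ~(FRAME_SIZE - 1);
		if (boundary <= end) {
			/* sske_frame would key everything up to boundary. */
			printf("sske_frame %#lx..%#lx\n", start, boundary - 1);
			start = boundary;
			continue;
		}
		printf("sske page  %#lx\n", start);
		start += PAGE_SIZE;
	}
}

int main(void)
{
	key_init_range(0x0ff000UL, 0x301000UL);	/* hypothetical range */
	return 0;
}
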
Signed-off-by: Dominik Dingel Signed-off-by: Janosch Frank [replaced page_set_storage_key loop with __storage_key_init_range] Reviewed-by: Martin Schwidefsky Reviewed-by: David Hildenbrand --- arch/s390/mm/gmap.c | 32 +++++++++++++++++++++++++++++--- arch/s390/mm/pageattr.c | 6 ++---- 2 files changed, 31 insertions(+), 7 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 3807de25a706..a8e32e60226b 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2539,17 +2539,43 @@ EXPORT_SYMBOL_GPL(s390_enable_sie); * Enable storage key handling from now on and initialize the storage * keys with the default key. */ -static int __s390_enable_skey(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) +static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) { /* Clear storage key */ ptep_zap_key(walk->mm, addr, pte); return 0; } +static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, + unsigned long hmask, unsigned long next, + struct mm_walk *walk) +{ + pmd_t *pmd = (pmd_t *)pte; + unsigned long start, end; + + /* + * The write check makes sure we do not set a key on shared + * memory. This is needed as the walker does not differentiate + * between actual guest memory and the process executable or + * shared libraries. + */ + if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || + !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) + return 0; + + start = pmd_val(*pmd) & HPAGE_MASK; + end = start + HPAGE_SIZE - 1; + __storage_key_init_range(start, end); + return 0; +} + int s390_enable_skey(void) { - struct mm_walk walk = { .pte_entry = __s390_enable_skey }; + struct mm_walk walk = { + .hugetlb_entry = __s390_enable_skey_hugetlb, + .pte_entry = __s390_enable_skey_pte, + }; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc = 0; diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index c44171588d08..f8c6faab41f4 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -14,7 +14,7 @@ static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) { - asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],9,0" + asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0" : [addr] "+a" (addr) : [skey] "d" (skey)); return addr; } @@ -23,8 +23,6 @@ void __storage_key_init_range(unsigned long start, unsigned long end) { unsigned long boundary, size; - if (!PAGE_DEFAULT_KEY) - return; while (start < end) { if (MACHINE_HAS_EDAT1) { /* set storage keys for a 1MB frame */ @@ -37,7 +35,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end) continue; } } - page_set_storage_key(start, PAGE_DEFAULT_KEY, 0); + page_set_storage_key(start, PAGE_DEFAULT_KEY, 1); start += PAGE_SIZE; } } -- cgit v1.2.3-55-g7522 From 3afdfca69870963ae01e280732a5ee493a2fcbb3 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:26 +0100 Subject: s390/mm: Clear skeys for newly mapped huge guest pmds Similarly to the pte skey handling, where we set the storage key to the default key for each newly mapped pte, we also have to do that for huge pmds. With the PG_arch_1 flag we keep track of whether the area has already been cleared of its skeys.
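
The once-only guard used here, reduced to its core in a userspace model (the kernel's atomic test_and_set_bit is simulated non-atomically for the demo):

#include <stdbool.h>
#include <stdio.h>

#define PG_ARCH_1 7	/* arbitrary bit index for the demo */

struct page { unsigned long flags; };

/* Non-atomic stand-in for the kernel's test_and_set_bit(). */
static bool test_and_set_bit(int nr, unsigned long *flags)
{
	bool old = *flags & (1UL << nr);

	*flags |= 1UL << nr;
	return old;
}

static void clear_skeys_once(struct page *page)
{
	if (!test_and_set_bit(PG_ARCH_1, &page->flags))
		printf("initializing storage keys for this huge page\n");
	else
		printf("already initialized, skipping\n");
}

int main(void)
{
	struct page huge_page = { 0 };

	clear_skeys_once(&huge_page);	/* first mapping: initialize */
	clear_skeys_once(&huge_page);	/* remap: skip */
	return 0;
}
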
Signed-off-by: Janosch Frank Reviewed-by: Martin Schwidefsky --- arch/s390/include/asm/hugetlb.h | 5 ++++- arch/s390/mm/gmap.c | 2 ++ arch/s390/mm/hugetlbpage.c | 24 ++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 9c5fc50204dd..2d1afa58a4b6 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -37,7 +37,10 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -#define arch_clear_hugepage_flags(page) do { } while (0) +static inline void arch_clear_hugepage_flags(struct page *page) +{ + clear_bit(PG_arch_1, &page->flags); +} static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index a8e32e60226b..a6738c0c4499 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2553,6 +2553,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, { pmd_t *pmd = (pmd_t *)pte; unsigned long start, end; + struct page *page = pmd_page(*pmd); /* * The write check makes sure we do not set a key on shared @@ -2567,6 +2568,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, start = pmd_val(*pmd) & HPAGE_MASK; end = start + HPAGE_SIZE - 1; __storage_key_init_range(start, end); + set_bit(PG_arch_1, &page->flags); return 0; } diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index e804090f4470..b0246c705a19 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -123,6 +123,29 @@ static inline pte_t __rste_to_pte(unsigned long rste) return pte; } +static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) +{ + struct page *page; + unsigned long size, paddr; + + if (!mm_uses_skeys(mm) || + rste & _SEGMENT_ENTRY_INVALID) + return; + + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + page = pud_page(__pud(rste)); + size = PUD_SIZE; + paddr = rste & PUD_MASK; + } else { + page = pmd_page(__pmd(rste)); + size = PMD_SIZE; + paddr = rste & PMD_MASK; + } + + if (!test_and_set_bit(PG_arch_1, &page->flags)) + __storage_key_init_range(paddr, paddr + size - 1); +} + void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -137,6 +160,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE; else rste |= _SEGMENT_ENTRY_LARGE; + clear_huge_pte_skeys(mm, rste); pte_val(*ptep) = rste; } -- cgit v1.2.3-55-g7522 From 637ff9efe5eab419ea7f2bd6f2cf50f3cb69e322 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:28 +0100 Subject: s390/mm: Add huge pmd storage key handling Storage keys for guests with huge page mappings have to be managed in hardware. There are no PGSTEs for PMDs that we could use to retain the guest's logical view of the key.
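
The address arithmetic that the new huge-pmd key paths rely on, shown standalone (hypothetical input values; on s390 HPAGE_SHIFT is 20):

#include <stdio.h>

#define HPAGE_SHIFT	20
#define HPAGE_SIZE	(1UL << HPAGE_SHIFT)
#define HPAGE_MASK	(~(HPAGE_SIZE - 1))

int main(void)
{
	unsigned long pmd_val = 0x12340000fe30UL;	/* hypothetical pmd */
	unsigned long addr = 0x7f0000053000UL;		/* hypothetical vaddr */
	unsigned long paddr;

	/* Segment origin from the pmd, 4K page offset from the address. */
	paddr = (pmd_val & HPAGE_MASK) | (addr & ~HPAGE_MASK);
	printf("storage key is accessed at %#lx\n", paddr);
	return 0;
}
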
Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand --- arch/s390/mm/pgtable.c | 108 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 98 insertions(+), 10 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 684df964e345..37d68706f5aa 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -410,6 +410,24 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, return old; } +static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, addr); + if (!pud) + return NULL; + pmd = pmd_alloc(mm, pud, addr); + return pmd; +} + pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t new) { @@ -734,12 +752,36 @@ EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, bool nq) { - unsigned long keyul; + unsigned long keyul, paddr; spinlock_t *ptl; pgste_t old, new; + pmd_t *pmdp; pte_t *ptep; - ptep = get_locked_pte(mm, addr, &ptl); + pmdp = pmd_alloc_map(mm, addr); + if (unlikely(!pmdp)) + return -EFAULT; + + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return -EFAULT; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + /* + * Huge pmds need quiescing operations, they are + * always mapped. + */ + page_set_storage_key(paddr, key, 1); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + + ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -750,14 +792,14 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48; pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; if (!(pte_val(*ptep) & _PAGE_INVALID)) { - unsigned long address, bits, skey; + unsigned long bits, skey; - address = pte_val(*ptep) & PAGE_MASK; - skey = (unsigned long) page_get_storage_key(address); + paddr = pte_val(*ptep) & PAGE_MASK; + skey = (unsigned long) page_get_storage_key(paddr); bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); /* Set storage key ACC and FP */ - page_set_storage_key(address, skey, !nq); + page_set_storage_key(paddr, skey, !nq); /* Merge host changed & referenced into pgste */ pgste_val(new) |= bits << 52; } @@ -813,11 +855,32 @@ EXPORT_SYMBOL(cond_set_guest_storage_key); int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) { spinlock_t *ptl; + unsigned long paddr; pgste_t old, new; + pmd_t *pmdp; pte_t *ptep; int cc = 0; - ptep = get_locked_pte(mm, addr, &ptl); + pmdp = pmd_alloc_map(mm, addr); + if (unlikely(!pmdp)) + return -EFAULT; + + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + spin_unlock(ptl); + return -EFAULT; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + cc = page_reset_referenced(paddr); + spin_unlock(ptl); + return cc; + } + spin_unlock(ptl); + + ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -826,7 +889,8 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) pgste_val(new) &= ~PGSTE_GR_BIT; if (!(pte_val(*ptep) & _PAGE_INVALID)) { - cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK); + paddr = 
pte_val(*ptep) & PAGE_MASK; + cc = page_reset_referenced(paddr); /* Merge real referenced bit into host-set */ pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT; } @@ -845,18 +909,42 @@ EXPORT_SYMBOL(reset_guest_reference_bit); int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char *key) { + unsigned long paddr; spinlock_t *ptl; pgste_t pgste; + pmd_t *pmdp; pte_t *ptep; - ptep = get_locked_pte(mm, addr, &ptl); + pmdp = pmd_alloc_map(mm, addr); + if (unlikely(!pmdp)) + return -EFAULT; + + ptl = pmd_lock(mm, pmdp); + if (!pmd_present(*pmdp)) { + /* Not yet mapped memory has a zero key */ + spin_unlock(ptl); + *key = 0; + return 0; + } + + if (pmd_large(*pmdp)) { + paddr = pmd_val(*pmdp) & HPAGE_MASK; + paddr |= addr & ~HPAGE_MASK; + *key = page_get_storage_key(paddr); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + + ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); if (unlikely(!ptep)) return -EFAULT; pgste = pgste_get_lock(ptep); *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + paddr = pte_val(*ptep) & PAGE_MASK; if (!(pte_val(*ptep) & _PAGE_INVALID)) - *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK); + *key = page_get_storage_key(paddr); /* Reflect guest's logical view, not physical */ *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; pgste_set_unlock(ptep, pgste); -- cgit v1.2.3-55-g7522 From 7d735b9ae82d073e23ff7d2648e0aa8056049d16 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Fri, 13 Jul 2018 11:28:29 +0100 Subject: s390/mm: hugetlb pages within a gmap can not be freed Guests backed by huge pages could theoretically free unused pages via the diagnose 10 instruction. We currently don't allow that, so we don't have to refault the pages once they are needed again. Signed-off-by: Dominik Dingel Reviewed-by: Martin Schwidefsky Reviewed-by: David Hildenbrand Signed-off-by: Janosch Frank --- arch/s390/mm/gmap.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index a6738c0c4499..736ed32a83c5 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -706,6 +706,12 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) vmaddr |= gaddr & ~PMD_MASK; /* Find vma in the parent mm */ vma = find_vma(gmap->mm, vmaddr); + /* + * We do not discard pages that are backed by + * hugetlbfs, so we don't have to refault them. + */ + if (vma && is_vm_hugetlb_page(vma)) + continue; size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); zap_page_range(vma, vmaddr, size); } -- cgit v1.2.3-55-g7522 From a9e00d8349c98e0973c8b0d671d69e838f7b5bcc Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 13 Jul 2018 11:28:37 +0100 Subject: s390/mm: Add huge page gmap linking support Let's allow huge pmd linking when enabled through the KVM_CAP_S390_HPAGE_1M capability. We can now also restrict gmap invalidation and notification to the cases where the capability has been activated, saving some cycles when that's not the case.
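
From the userspace side, a VMM would opt in roughly like this (a sketch with error handling elided; it assumes kernel headers that define KVM_CAP_S390_HPAGE_1M):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Enable 1MB huge-page backing for a freshly created VM fd. */
static int enable_hpage_1m(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_S390_HPAGE_1M;
	/* Should be done before guest memory is set up. */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
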
Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand --- arch/s390/include/asm/mmu.h | 2 ++ arch/s390/include/asm/mmu_context.h | 1 + arch/s390/mm/gmap.c | 9 ++++++--- arch/s390/mm/pgtable.c | 8 ++++---- 4 files changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index f5ff9dbad8ac..f31a15044c24 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -24,6 +24,8 @@ typedef struct { unsigned int uses_skeys:1; /* The mmu context uses CMM. */ unsigned int uses_cmm:1; + /* The gmaps associated with this context are allowed to use huge pages. */ + unsigned int allow_gmap_hpage_1m:1; } mm_context_t; #define INIT_MM_CONTEXT(name) \ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index d16bc79c30bb..0717ee76885d 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -32,6 +32,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.has_pgste = 0; mm->context.uses_skeys = 0; mm->context.uses_cmm = 0; + mm->context.allow_gmap_hpage_1m = 0; #endif switch (mm->context.asce_limit) { case _REGION2_SIZE: diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 736ed32a83c5..bb44990c8212 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2,8 +2,10 @@ /* * KVM guest address space mapping code * - * Copyright IBM Corp. 2007, 2016 + * Copyright IBM Corp. 2007, 2016, 2018 * Author(s): Martin Schwidefsky + * David Hildenbrand + * Janosch Frank */ #include @@ -588,8 +590,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) return -EFAULT; pmd = pmd_offset(pud, vmaddr); VM_BUG_ON(pmd_none(*pmd)); - /* large pmds cannot yet be handled */ - if (pmd_large(*pmd)) + /* Are we allowed to use huge pages? */ + if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) return -EFAULT; /* Link gmap segment table entry location to page table. 
*/ rc = radix_tree_preload(GFP_KERNEL); @@ -1632,6 +1634,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, unsigned long limit; int rc; + BUG_ON(parent->mm->context.allow_gmap_hpage_1m); BUG_ON(gmap_is_shadow(parent)); spin_lock(&parent->shadow_lock); sg = gmap_find_shadow(parent, asce, edat_level); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 37d68706f5aa..f2cc7da473e4 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -347,7 +347,7 @@ static inline void pmdp_idte_local(struct mm_struct *mm, mm->context.asce, IDTE_LOCAL); else __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); - if (mm_has_pgste(mm)) + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_local(mm, addr); } @@ -357,15 +357,15 @@ static inline void pmdp_idte_global(struct mm_struct *mm, if (MACHINE_HAS_TLB_GUEST) { __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); - if (mm_has_pgste(mm)) + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_global(mm, addr); } else if (MACHINE_HAS_IDTE) { __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); - if (mm_has_pgste(mm)) + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_idte_global(mm, addr); } else { __pmdp_csp(pmdp); - if (mm_has_pgste(mm)) + if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) gmap_pmdp_csp(mm, addr); } } -- cgit v1.2.3-55-g7522 From 37a366face294facb9c9d9fdd9f5b64a27456cbd Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Tue, 7 Aug 2018 18:57:11 +0200 Subject: s390/mm: fix addressing exception after suspend/resume Commit c9b5ad546e7d "s390/mm: tag normal pages vs pages used in page tables" accidentally changed the logic in arch_set_page_states(), which is used by the suspend/resume code. set_page_stable(page, order) was changed to set_page_stable_dat(page, 0). After this, only the first page of higher order pages will be set to stable, and a write to one of the unstable pages will result in an addressing exception. Fix this by using "order" again, instead of "0". Fixes: c9b5ad546e7d ("s390/mm: tag normal pages vs pages used in page tables") Cc: stable@vger.kernel.org # 4.14+ Reviewed-by: Heiko Carstens Signed-off-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/mm/page-states.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 382153ff17e3..dc3cede7f2ec 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -271,7 +271,7 @@ void arch_set_page_states(int make_stable) list_for_each(l, &zone->free_area[order].free_list[t]) { page = list_entry(l, struct page, lru); if (make_stable) - set_page_stable_dat(page, 0); + set_page_stable_dat(page, order); else set_page_unused(page, order); } -- cgit v1.2.3-55-g7522