From 1e9877902dc7e11d2be038371c6fbf2dfcd469d7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:01:54 -0800 Subject: mm/gup: Introduce get_user_pages_remote() For protection keys, we need to understand whether protections should be enforced in software or not. In general, we enforce protections when working on our own task, but not when on others. We call these "current" and "remote" operations. This patch introduces a new get_user_pages() variant: get_user_pages_remote() Which is a replacement for when get_user_pages() is called on non-current tsk/mm. We also introduce a new gup flag: FOLL_REMOTE which can be used for the "__" gup variants to get this new behavior. The uprobes is_trap_at_addr() location holds mmap_sem and calls get_user_pages(current->mm) on an instruction address. This makes it a pretty unique gup caller. Being an instruction access and also really originating from the kernel (vs. the app), I opted to consider this a 'remote' access where protection keys will not be enforced. Without protection keys, this patch should not change any behavior. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Vlastimil Babka Cc: jack@suse.cz Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210154.3F0E51EA@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- mm/gup.c | 27 ++++++++++++++++++++++----- mm/memory.c | 2 +- mm/process_vm_access.c | 11 ++++++++--- 3 files changed, 31 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 7bf19ffa2199..36ca850936c9 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -870,7 +870,7 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, EXPORT_SYMBOL(get_user_pages_unlocked); /* - * get_user_pages() - pin user pages in memory + * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. * @mm: mm_struct of target mm @@ -924,12 +924,29 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * should use get_user_pages because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, int write, - int force, struct page **pages, struct vm_area_struct **vmas) +long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) { return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, vmas, NULL, false, FOLL_TOUCH); + pages, vmas, NULL, false, + FOLL_TOUCH | FOLL_REMOTE); +} +EXPORT_SYMBOL(get_user_pages_remote); + +/* + * This is the same as get_user_pages_remote() for the time + * being. 
+ */ +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, + write, force, pages, vmas, NULL, false, + FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/memory.c b/mm/memory.c index 38090ca37a08..8bfbad0cca8c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3685,7 +3685,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, void *maddr; struct page *page = NULL; - ret = get_user_pages(tsk, mm, addr, 1, + ret = get_user_pages_remote(tsk, mm, addr, 1, write, 1, &page, &vma); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 5d453e58ddbf..07514d41ebcc 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -98,9 +98,14 @@ static int process_vm_rw_single_vec(unsigned long addr, int pages = min(nr_pages, max_pages_per_loop); size_t bytes; - /* Get the pages we're interested in */ - pages = get_user_pages_unlocked(task, mm, pa, pages, - vm_write, 0, process_pages); + /* + * Get the pages we're interested in. We must + * add FOLL_REMOTE because task/mm might not + * current/current->mm + */ + pages = __get_user_pages_unlocked(task, mm, pa, pages, + vm_write, 0, process_pages, + FOLL_REMOTE); if (pages <= 0) return -EFAULT; -- cgit v1.2.3-55-g7522 From cde70140fed8429acf7a14e2e2cbd3e329036653 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:01:55 -0800 Subject: mm/gup: Overload get_user_pages() functions The concept here was a suggestion from Ingo. The implementation horrors are all mine. This allows get_user_pages(), get_user_pages_unlocked(), and get_user_pages_locked() to be called with or without the leading tsk/mm arguments. We will give a compile-time warning about the old style being __deprecated and we will also WARN_ON() if the non-remote version is used for a remote-style access. Doing this, folks will get nice warnings and will not break the build. This should be nice for -next and will hopefully let developers fix up their own code instead of maintainers needing to do it at merge time. The way we do this is hideous. It uses the __VA_ARGS__ macro functionality to call different functions based on the number of arguments passed to the macro. There's an additional hack to ensure that our EXPORT_SYMBOL() of the deprecated symbols doesn't trigger a warning. We should be able to remove this mess as soon as -rc1 hits in the release after this is merged. Signed-off-by: Dave Hansen Cc: Al Viro Cc: Alexander Kuleshov Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Dan Williams Cc: Dave Hansen Cc: Dominik Dingel Cc: Geliang Tang Cc: Jan Kara Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Konstantin Khlebnikov Cc: Leon Romanovsky Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Mateusz Guzik Cc: Maxime Coquelin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oleg Nesterov Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Xie XiuQi Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210155.73222EE1@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- include/linux/mm.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++-------- mm/gup.c | 62 ++++++++++++++++++++++++++++++++++----------- mm/nommu.c | 64 ++++++++++++++++++++++++++++++++-------------- mm/util.c | 4 +-- 4 files changed, 158 insertions(+), 46 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index faf3b709eead..4c7317828fda 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1229,24 +1229,78 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas); -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - struct vm_area_struct **vmas); -long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - int *locked); +long get_user_pages6(unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas); +long get_user_pages_locked6(unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, int *locked); long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, unsigned int gup_flags); -long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages); int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); +/* suppress warnings from use in EXPORT_SYMBOL() */ +#ifndef __DISABLE_GUP_DEPRECATED +#define __gup_deprecated __deprecated +#else +#define __gup_deprecated +#endif +/* + * These macros provide backward-compatibility with the old + * get_user_pages() variants which took tsk/mm. These + * functions/macros provide both compile-time __deprecated so we + * can catch old-style use and not break the build. The actual + * functions also have WARN_ON()s to let us know at runtime if + * the get_user_pages() should have been the "remote" variant. + * + * These are hideous, but temporary. + * + * If you run into one of these __deprecated warnings, look + * at how you are calling get_user_pages(). If you are calling + * it with current/current->mm as the first two arguments, + * simply remove those arguments. The behavior will be the same + * as it is now. If you are calling it on another task, use + * get_user_pages_remote() instead. + * + * Any questions? 
Ask Dave Hansen + */ +long +__gup_deprecated +get_user_pages8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas); +#define GUP_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, get_user_pages, ...) \ + get_user_pages +#define get_user_pages(...) GUP_MACRO(__VA_ARGS__, \ + get_user_pages8, x, \ + get_user_pages6, x, x, x, x, x)(__VA_ARGS__) + +__gup_deprecated +long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked); +#define GUPL_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, get_user_pages_locked, ...) \ + get_user_pages_locked +#define get_user_pages_locked(...) GUPL_MACRO(__VA_ARGS__, \ + get_user_pages_locked8, x, \ + get_user_pages_locked6, x, x, x, x)(__VA_ARGS__) + +__gup_deprecated +long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages); +#define GUPU_MACRO(_1, _2, _3, _4, _5, _6, _7, get_user_pages_unlocked, ...) \ + get_user_pages_unlocked +#define get_user_pages_unlocked(...) GUPU_MACRO(__VA_ARGS__, \ + get_user_pages_unlocked7, x, \ + get_user_pages_unlocked5, x, x, x, x)(__VA_ARGS__) + /* Container for pinned pfns / pages */ struct frame_vector { unsigned int nr_allocated; /* Number of frames we have space for */ diff --git a/mm/gup.c b/mm/gup.c index 36ca850936c9..8a035e042b35 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1,3 +1,4 @@ +#define __DISABLE_GUP_DEPRECATED 1 #include #include #include @@ -807,15 +808,15 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, * if (locked) * up_read(&mm->mmap_sem); */ -long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages_locked6(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, int *locked) { - return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, NULL, locked, true, FOLL_TOUCH); + return __get_user_pages_locked(current, current->mm, start, nr_pages, + write, force, pages, NULL, locked, true, + FOLL_TOUCH); } -EXPORT_SYMBOL(get_user_pages_locked); +EXPORT_SYMBOL(get_user_pages_locked6); /* * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to @@ -860,14 +861,13 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); * or if "force" shall be set to 1 (get_user_pages_fast misses the * "force" parameter). */ -long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { - return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, - force, pages, FOLL_TOUCH); + return __get_user_pages_unlocked(current, current->mm, start, nr_pages, + write, force, pages, FOLL_TOUCH); } -EXPORT_SYMBOL(get_user_pages_unlocked); +EXPORT_SYMBOL(get_user_pages_unlocked5); /* * get_user_pages_remote() - pin user pages in memory @@ -939,16 +939,15 @@ EXPORT_SYMBOL(get_user_pages_remote); * This is the same as get_user_pages_remote() for the time * being. 
*/ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages6(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { - return __get_user_pages_locked(tsk, mm, start, nr_pages, + return __get_user_pages_locked(current, current->mm, start, nr_pages, write, force, pages, vmas, NULL, false, FOLL_TOUCH); } -EXPORT_SYMBOL(get_user_pages); +EXPORT_SYMBOL(get_user_pages6); /** * populate_vma_page_range() - populate a range of pages in the vma. @@ -1484,3 +1483,38 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, } #endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ + +long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) +{ + WARN_ONCE(tsk != current, "get_user_pages() called on remote task"); + WARN_ONCE(mm != current->mm, "get_user_pages() called on remote mm"); + + return get_user_pages6(start, nr_pages, write, force, pages, vmas); +} +EXPORT_SYMBOL(get_user_pages8); + +long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, int *locked) +{ + WARN_ONCE(tsk != current, "get_user_pages_locked() called on remote task"); + WARN_ONCE(mm != current->mm, "get_user_pages_locked() called on remote mm"); + + return get_user_pages_locked6(start, nr_pages, write, force, pages, locked); +} +EXPORT_SYMBOL(get_user_pages_locked8); + +long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + WARN_ONCE(tsk != current, "get_user_pages_unlocked() called on remote task"); + WARN_ONCE(mm != current->mm, "get_user_pages_unlocked() called on remote mm"); + + return get_user_pages_unlocked5(start, nr_pages, write, force, pages); +} +EXPORT_SYMBOL(get_user_pages_unlocked7); + diff --git a/mm/nommu.c b/mm/nommu.c index fbf6f0f1d6c9..b64d04d19702 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -15,6 +15,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define __DISABLE_GUP_DEPRECATED + #include #include #include @@ -182,8 +184,7 @@ finish_or_fault: * slab page or a secondary page from a compound page * - don't permit access to VMAs that don't support it, such as I/O mappings */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages6(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { @@ -194,20 +195,18 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, - NULL); + return __get_user_pages(current, current->mm, start, nr_pages, flags, + pages, vmas, NULL); } -EXPORT_SYMBOL(get_user_pages); +EXPORT_SYMBOL(get_user_pages6); -long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - int *locked) +long get_user_pages_locked6(unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) { - return get_user_pages(tsk, mm, start, nr_pages, write, force, - pages, NULL); + return get_user_pages6(start, 
nr_pages, write, force, pages, NULL); } -EXPORT_SYMBOL(get_user_pages_locked); +EXPORT_SYMBOL(get_user_pages_locked6); long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -216,21 +215,20 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, { long ret; down_read(&mm->mmap_sem); - ret = get_user_pages(tsk, mm, start, nr_pages, write, force, - pages, NULL); + ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages, + NULL, NULL); up_read(&mm->mmap_sem); return ret; } EXPORT_SYMBOL(__get_user_pages_unlocked); -long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { - return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, - force, pages, 0); + return __get_user_pages_unlocked(current, current->mm, start, nr_pages, + write, force, pages, 0); } -EXPORT_SYMBOL(get_user_pages_unlocked); +EXPORT_SYMBOL(get_user_pages_unlocked5); /** * follow_pfn - look up PFN at a user virtual address @@ -2108,3 +2106,31 @@ static int __meminit init_admin_reserve(void) return 0; } subsys_initcall(init_admin_reserve); + +long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) +{ + return get_user_pages6(start, nr_pages, write, force, pages, vmas); +} +EXPORT_SYMBOL(get_user_pages8); + +long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) +{ + return get_user_pages_locked6(start, nr_pages, write, + force, pages, locked); +} +EXPORT_SYMBOL(get_user_pages_locked8); + +long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + return get_user_pages_unlocked5(start, nr_pages, write, force, pages); +} +EXPORT_SYMBOL(get_user_pages_unlocked7); + diff --git a/mm/util.c b/mm/util.c index 4fb14ca5a419..1e6011699cab 100644 --- a/mm/util.c +++ b/mm/util.c @@ -283,9 +283,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - struct mm_struct *mm = current->mm; - return get_user_pages_unlocked(current, mm, start, nr_pages, - write, 0, pages); + return get_user_pages_unlocked(start, nr_pages, write, 0, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); -- cgit v1.2.3-55-g7522 From d4edcf0d56958db0aca0196314ca38a5e730ea92 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:01:56 -0800 Subject: mm/gup: Switch all callers of get_user_pages() to not pass tsk/mm We will soon modify the vanilla get_user_pages() so it can no longer be used on mm/tasks other than 'current/current->mm', which is by far the most common way it is called. For now, we allow the old-style calls, but warn when they are used. (implemented in previous patch) This patch switches all callers of: get_user_pages() get_user_pages_unlocked() get_user_pages_locked() to stop passing tsk/mm so they will no longer see the warnings. 
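
As a rough illustration of the conversion pattern this patch applies (the variable names and argument values here are made up for the example; the real hunks follow below), a caller operating on its own mm drops the leading tsk/mm pair, while a genuinely remote access moves to get_user_pages_remote():

    /* old style, now __deprecated */
    ret = get_user_pages(current, current->mm, addr, 1, write, 0, &page, NULL);

    /* new style: current/current->mm are implied */
    ret = get_user_pages(addr, 1, write, 0, &page, NULL);

    /* remote tsk/mm: explicit variant, which also passes FOLL_REMOTE */
    ret = get_user_pages_remote(tsk, mm, addr, 1, write, 0, &page, NULL);

The get_user_pages() name keeps working for both arities during the transition because the macros added to include/linux/mm.h by the previous patch dispatch on argument count: eight arguments select get_user_pages8(), six select get_user_pages6().
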
Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Vlastimil Babka Cc: jack@suse.cz Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210156.113E9407@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/cris/arch-v32/drivers/cryptocop.c | 8 ++------ arch/ia64/kernel/err_inject.c | 3 +-- arch/mips/mm/gup.c | 3 +-- arch/s390/mm/gup.c | 4 +--- arch/sh/mm/gup.c | 2 +- arch/sparc/mm/gup.c | 2 +- arch/x86/mm/gup.c | 2 +- arch/x86/mm/mpx.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 +-- drivers/gpu/drm/radeon/radeon_ttm.c | 3 +-- drivers/gpu/drm/via/via_dmablit.c | 3 +-- drivers/infiniband/core/umem.c | 2 +- drivers/infiniband/hw/mthca/mthca_memfree.c | 3 +-- drivers/infiniband/hw/qib/qib_user_pages.c | 3 +-- drivers/infiniband/hw/usnic/usnic_uiom.c | 2 +- drivers/media/pci/ivtv/ivtv-udma.c | 4 ++-- drivers/media/pci/ivtv/ivtv-yuv.c | 10 ++++------ drivers/media/v4l2-core/videobuf-dma-sg.c | 3 +-- drivers/misc/mic/scif/scif_rma.c | 2 -- drivers/misc/sgi-gru/grufault.c | 3 +-- drivers/scsi/st.c | 2 -- drivers/video/fbdev/pvr2fb.c | 4 ++-- drivers/virt/fsl_hypervisor.c | 5 ++--- mm/frame_vector.c | 2 +- mm/gup.c | 6 ++++-- mm/ksm.c | 2 +- mm/mempolicy.c | 6 +++--- net/ceph/pagevec.c | 2 +- virt/kvm/kvm_main.c | 10 +++++----- 29 files changed, 44 insertions(+), 64 deletions(-) (limited to 'mm') diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c index 877da1908234..617645d21b20 100644 --- a/arch/cris/arch-v32/drivers/cryptocop.c +++ b/arch/cris/arch-v32/drivers/cryptocop.c @@ -2719,9 +2719,7 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig /* Acquire the mm page semaphore. 
*/ down_read(¤t->mm->mmap_sem); - err = get_user_pages(current, - current->mm, - (unsigned long int)(oper.indata + prev_ix), + err = get_user_pages((unsigned long int)(oper.indata + prev_ix), noinpages, 0, /* read access only for in data */ 0, /* no force */ @@ -2736,9 +2734,7 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig } noinpages = err; if (oper.do_cipher){ - err = get_user_pages(current, - current->mm, - (unsigned long int)oper.cipher_outdata, + err = get_user_pages((unsigned long int)oper.cipher_outdata, nooutpages, 1, /* write access for out data */ 0, /* no force */ diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c index 0c161ed6d18e..09f845793d12 100644 --- a/arch/ia64/kernel/err_inject.c +++ b/arch/ia64/kernel/err_inject.c @@ -142,8 +142,7 @@ store_virtual_to_phys(struct device *dev, struct device_attribute *attr, u64 virt_addr=simple_strtoull(buf, NULL, 16); int ret; - ret = get_user_pages(current, current->mm, virt_addr, - 1, VM_READ, 0, NULL, NULL); + ret = get_user_pages(virt_addr, 1, VM_READ, 0, NULL, NULL); if (ret<=0) { #ifdef ERR_INJ_DEBUG printk("Virtual address %lx is not existing.\n",virt_addr); diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 1afd87c999b0..982e83f9d11f 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -286,8 +286,7 @@ slow_irqon: start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(current, mm, start, - (end - start) >> PAGE_SHIFT, + ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, write, 0, pages); /* Have to be a bit careful with return values */ diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 13dab0c1645c..49a1c84ed266 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -210,7 +210,6 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - struct mm_struct *mm = current->mm; int nr, ret; might_sleep(); @@ -222,8 +221,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, /* Try to get the remaining pages with get_user_pages */ start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(current, mm, start, - nr_pages - nr, write, 0, pages); + ret = get_user_pages_unlocked(start, nr_pages - nr, write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) ret = (ret < 0) ? 
nr : ret + nr; diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c index e7af6a65baab..40fa6c8adc43 100644 --- a/arch/sh/mm/gup.c +++ b/arch/sh/mm/gup.c @@ -257,7 +257,7 @@ slow_irqon: start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(current, mm, start, + ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, write, 0, pages); /* Have to be a bit careful with return values */ diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index eb3d8e8ebc6b..4e06750a5d29 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -237,7 +237,7 @@ slow: start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(current, mm, start, + ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, write, 0, pages); /* Have to be a bit careful with return values */ diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 6d5eb5900372..ce5e4545203b 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -422,7 +422,7 @@ slow_irqon: start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(current, mm, start, + ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, write, 0, pages); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index b2fd67da1701..84fa4a482c78 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -546,8 +546,8 @@ static int mpx_resolve_fault(long __user *addr, int write) int nr_pages = 1; int force = 0; - gup_ret = get_user_pages(current, current->mm, (unsigned long)addr, - nr_pages, write, force, NULL, NULL); + gup_ret = get_user_pages((unsigned long)addr, nr_pages, write, + force, NULL, NULL); /* * get_user_pages() returns number of pages gotten. * 0 means we failed to fault in and get anything, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 6442a06d6fdc..5fedfb68d7ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -518,8 +518,7 @@ static int amdgpu_ttm_tt_pin_userptr(struct ttm_tt *ttm) uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE; struct page **pages = ttm->pages + pinned; - r = get_user_pages(current, current->mm, userptr, num_pages, - write, 0, pages, NULL); + r = get_user_pages(userptr, num_pages, write, 0, pages, NULL); if (r < 0) goto release_pages; diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index e34307459e50..927a9f2d6570 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -554,8 +554,7 @@ static int radeon_ttm_tt_pin_userptr(struct ttm_tt *ttm) uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE; struct page **pages = ttm->pages + pinned; - r = get_user_pages(current, current->mm, userptr, num_pages, - write, 0, pages, NULL); + r = get_user_pages(userptr, num_pages, write, 0, pages, NULL); if (r < 0) goto release_pages; diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c index d0cbd5ecd7f0..e797dfc07ae3 100644 --- a/drivers/gpu/drm/via/via_dmablit.c +++ b/drivers/gpu/drm/via/via_dmablit.c @@ -239,8 +239,7 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer) if (NULL == vsg->pages) return -ENOMEM; down_read(¤t->mm->mmap_sem); - ret = get_user_pages(current, current->mm, - (unsigned long)xfer->mem_addr, + ret = get_user_pages((unsigned long)xfer->mem_addr, vsg->num_pages, (vsg->direction == DMA_FROM_DEVICE), 0, vsg->pages, NULL); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 38acb3cfc545..fe4d2e1a8b58 100644 
--- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -188,7 +188,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, sg_list_start = umem->sg_head.sgl; while (npages) { - ret = get_user_pages(current, current->mm, cur_base, + ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), 1, !umem->writable, page_list, vma_list); diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 7d2e42dd6926..6c00d04b8b28 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -472,8 +472,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, goto out; } - ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0, - pages, NULL); + ret = get_user_pages(uaddr & PAGE_MASK, 1, 1, 0, pages, NULL); if (ret < 0) goto out; diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 74f90b2619f6..2d2b94fd3633 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -66,8 +66,7 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, } for (got = 0; got < num_pages; got += ret) { - ret = get_user_pages(current, current->mm, - start_page + got * PAGE_SIZE, + ret = get_user_pages(start_page + got * PAGE_SIZE, num_pages - got, 1, 1, p + got, NULL); if (ret < 0) diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 645a5f6e6c88..7209fbc03ccb 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -144,7 +144,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = 0; while (npages) { - ret = get_user_pages(current, current->mm, cur_base, + ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof(struct page *)), 1, !writable, page_list, NULL); diff --git a/drivers/media/pci/ivtv/ivtv-udma.c b/drivers/media/pci/ivtv/ivtv-udma.c index 24152accc66c..4769469fe842 100644 --- a/drivers/media/pci/ivtv/ivtv-udma.c +++ b/drivers/media/pci/ivtv/ivtv-udma.c @@ -124,8 +124,8 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, } /* Get user pages for DMA Xfer */ - err = get_user_pages_unlocked(current, current->mm, - user_dma.uaddr, user_dma.page_count, 0, 1, dma->map); + err = get_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, 0, + 1, dma->map); if (user_dma.page_count != err) { IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n", diff --git a/drivers/media/pci/ivtv/ivtv-yuv.c b/drivers/media/pci/ivtv/ivtv-yuv.c index 2b8e7b2f2b86..b094054cda6e 100644 --- a/drivers/media/pci/ivtv/ivtv-yuv.c +++ b/drivers/media/pci/ivtv/ivtv-yuv.c @@ -75,14 +75,12 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, ivtv_udma_get_page_info (&uv_dma, (unsigned long)args->uv_source, 360 * uv_decode_height); /* Get user pages for DMA Xfer */ - y_pages = get_user_pages_unlocked(current, current->mm, - y_dma.uaddr, y_dma.page_count, 0, 1, - &dma->map[0]); + y_pages = get_user_pages_unlocked(y_dma.uaddr, + y_dma.page_count, 0, 1, &dma->map[0]); uv_pages = 0; /* silence gcc. 
value is set and consumed only if: */ if (y_pages == y_dma.page_count) { - uv_pages = get_user_pages_unlocked(current, current->mm, - uv_dma.uaddr, uv_dma.page_count, 0, 1, - &dma->map[y_pages]); + uv_pages = get_user_pages_unlocked(uv_dma.uaddr, + uv_dma.page_count, 0, 1, &dma->map[y_pages]); } if (y_pages != y_dma.page_count || uv_pages != uv_dma.page_count) { diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index f669cedca8bd..df4c052c6bd6 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -181,8 +181,7 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n", data, size, dma->nr_pages); - err = get_user_pages(current, current->mm, - data & PAGE_MASK, dma->nr_pages, + err = get_user_pages(data & PAGE_MASK, dma->nr_pages, rw == READ, 1, /* force */ dma->pages, NULL); diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c index 8310b4dbff06..0fa0d242b295 100644 --- a/drivers/misc/mic/scif/scif_rma.c +++ b/drivers/misc/mic/scif/scif_rma.c @@ -1394,8 +1394,6 @@ retry: } pinned_pages->nr_pages = get_user_pages( - current, - mm, (u64)addr, nr_pages, !!(prot & SCIF_PROT_WRITE), diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index f74fc0ca2ef9..a2d97b9b17e3 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -198,8 +198,7 @@ static int non_atomic_pte_lookup(struct vm_area_struct *vma, #else *pageshift = PAGE_SHIFT; #endif - if (get_user_pages - (current, current->mm, vaddr, 1, write, 0, &page, NULL) <= 0) + if (get_user_pages(vaddr, 1, write, 0, &page, NULL) <= 0) return -EFAULT; *paddr = page_to_phys(page); put_page(page); diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 2e522951b619..664852af4417 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -4817,8 +4817,6 @@ static int sgl_map_user_pages(struct st_buffer *STbp, /* Try to fault in all of the necessary pages */ /* rw==READ means read from drive, write into memory area */ res = get_user_pages_unlocked( - current, - current->mm, uaddr, nr_pages, rw == READ, diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c index 0e24eb9c219c..71a923e53f93 100644 --- a/drivers/video/fbdev/pvr2fb.c +++ b/drivers/video/fbdev/pvr2fb.c @@ -686,8 +686,8 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf, if (!pages) return -ENOMEM; - ret = get_user_pages_unlocked(current, current->mm, (unsigned long)buf, - nr_pages, WRITE, 0, pages); + ret = get_user_pages_unlocked((unsigned long)buf, nr_pages, WRITE, + 0, pages); if (ret < nr_pages) { nr_pages = ret; diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 32c8fc5f7a5c..60bdad3a689b 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -244,9 +244,8 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) /* Get the physical addresses of the source buffer */ down_read(¤t->mm->mmap_sem); - num_pinned = get_user_pages(current, current->mm, - param.local_vaddr - lb_offset, num_pages, - (param.source == -1) ? READ : WRITE, + num_pinned = get_user_pages(param.local_vaddr - lb_offset, + num_pages, (param.source == -1) ? 
READ : WRITE, 0, pages, NULL); up_read(¤t->mm->mmap_sem); diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 7cf2b7163222..381bb07ed14f 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -58,7 +58,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; vec->is_pfns = false; - ret = get_user_pages_locked(current, mm, start, nr_frames, + ret = get_user_pages_locked(start, nr_frames, write, force, (struct page **)(vec->ptrs), &locked); goto out; } diff --git a/mm/gup.c b/mm/gup.c index 8a035e042b35..de24ef4cd1af 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -936,8 +936,10 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, EXPORT_SYMBOL(get_user_pages_remote); /* - * This is the same as get_user_pages_remote() for the time - * being. + * This is the same as get_user_pages_remote(), just with a + * less-flexible calling convention where we assume that the task + * and mm being operated on are the current task's. We also + * obviously don't pass FOLL_REMOTE in here. */ long get_user_pages6(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, diff --git a/mm/ksm.c b/mm/ksm.c index ca6d2a06a615..c2013f638d11 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -352,7 +352,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm) /* * We use break_ksm to break COW on a ksm page: it's a stripped down * - * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) + * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) * put_page(page); * * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4c4187c0e1de..dd0ce7fbd47b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -844,12 +844,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) } } -static int lookup_node(struct mm_struct *mm, unsigned long addr) +static int lookup_node(unsigned long addr) { struct page *p; int err; - err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); + err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL); if (err >= 0) { err = page_to_nid(p); put_page(p); @@ -904,7 +904,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { - err = lookup_node(mm, addr); + err = lookup_node(addr); if (err < 0) goto out; *policy = err; diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index d4f5f220a8e5..10297f7a89ba 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -24,7 +24,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data, return ERR_PTR(-ENOMEM); while (got < num_pages) { - rc = get_user_pages_unlocked(current, current->mm, + rc = get_user_pages_unlocked( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), num_pages - got, write_page, 0, pages + got); if (rc < 0) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a11cfd20a6a0..0253ad900ec3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1264,15 +1264,16 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w return gfn_to_hva_memslot_prot(slot, gfn, writable); } -static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int write, struct page **page) +static int get_user_page_nowait(unsigned long start, int write, + struct page **page) { int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; if (write) flags |= 
FOLL_WRITE; - return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); + return __get_user_pages(current, current->mm, start, 1, flags, page, + NULL, NULL); } static inline int check_user_page_hwpoison(unsigned long addr) @@ -1334,8 +1335,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, if (async) { down_read(¤t->mm->mmap_sem); - npages = get_user_page_nowait(current, current->mm, - addr, write_fault, page); + npages = get_user_page_nowait(addr, write_fault, page); up_read(¤t->mm->mmap_sem); } else npages = __get_user_pages_unlocked(current, current->mm, addr, 1, -- cgit v1.2.3-55-g7522 From 63c17fb8e5a46a16e10e82005748837fd11a2024 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:08 -0800 Subject: mm/core, x86/mm/pkeys: Store protection bits in high VMA flags vma->vm_flags is an 'unsigned long', so has space for 32 flags on 32-bit architectures. The high 32 bits are unused on 64-bit platforms. We've steered away from using the unused high VMA bits for things because we would have difficulty supporting it on 32-bit. Protection Keys are not available in 32-bit mode, so there is no concern about supporting this feature in 32-bit mode or on 32-bit CPUs. This patch carves out 4 bits from the high half of vma->vm_flags and allows architectures to set config option to make them available. Sparse complains about these constants unless we explicitly call them "UL". Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jan Kara Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mel Gorman Cc: Michal Hocko Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sasha Levin Cc: Valentin Rothberg Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Xie XiuQi Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210208.81AF00D5@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + include/linux/mm.h | 11 +++++++++++ mm/Kconfig | 3 +++ 3 files changed, 15 insertions(+) (limited to 'mm') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3632cdd03201..fb2ebeb9a692 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -155,6 +155,7 @@ config X86 select VIRT_TO_BUS select X86_DEV_DMA_OPS if X86_64 select X86_FEATURE_NAMES if PROC_FS + select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS config INSTRUCTION_DECODER def_bool y diff --git a/include/linux/mm.h b/include/linux/mm.h index 4c7317828fda..54d173bcc327 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -170,6 +170,17 @@ extern unsigned int kobjsize(const void *objp); #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ +#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS +#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) +#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) +#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) +#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) +#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ + #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* 
PAT reserves whole VMA at once (x86) */ #elif defined(CONFIG_PPC) diff --git a/mm/Kconfig b/mm/Kconfig index 03cbfa072f42..6cf439948297 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -669,3 +669,6 @@ config ZONE_DEVICE config FRAME_VECTOR bool + +config ARCH_USES_HIGH_VMA_FLAGS + bool -- cgit v1.2.3-55-g7522 From d4925e00d59698a201231cf99dce47d8b922bb34 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:16 -0800 Subject: mm/gup: Factor out VMA fault permission checking This code matches a fault condition up with the VMA and ensures that the VMA allows the fault to be handled instead of just erroring out. We will be extending this in a moment to comprehend protection keys. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Dave Hansen Cc: Denys Vlasenko Cc: Dominik Dingel Cc: Eric B Munson Cc: H. Peter Anvin Cc: Jason Low Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sasha Levin Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210216.C3824032@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- mm/gup.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index de24ef4cd1af..b935c2c71ec9 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -610,6 +610,18 @@ next_page: } EXPORT_SYMBOL(__get_user_pages); +bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) +{ + vm_flags_t vm_flags; + + vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; + + if (!(vm_flags & vma->vm_flags)) + return false; + + return true; +} + /* * fixup_user_fault() - manually resolve a user page fault * @tsk: the task_struct to use for page fault accounting, or @@ -645,7 +657,6 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, bool *unlocked) { struct vm_area_struct *vma; - vm_flags_t vm_flags; int ret, major = 0; if (unlocked) @@ -656,8 +667,7 @@ retry: if (!vma || address < vma->vm_start) return -EFAULT; - vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; - if (!(vm_flags & vma->vm_flags)) + if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; ret = handle_mm_fault(mm, vma, address, fault_flags); -- cgit v1.2.3-55-g7522 From 33a709b25a760b91184bb335cf7d7c32b8123013 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:19 -0800 Subject: mm/gup, x86/mm/pkeys: Check VMAs and PTEs for protection keys Today, for normal faults and page table walks, we check the VMA and/or PTE to ensure that it is compatible with the action. For instance, if we get a write fault on a non-writeable VMA, we SIGSEGV. We try to do the same thing for protection keys. Basically, we try to make sure that if a user does this: mprotect(ptr, size, PROT_NONE); *ptr = foo; they see the same effects with protection keys when they do this: mprotect(ptr, size, PROT_READ|PROT_WRITE); set_pkey(ptr, size, 4); wrpkru(0xffffff3f); // access disable pkey 4 *ptr = foo; The state to do that checking is in the VMA, but we also sometimes have to do it on the page tables only, like when doing a get_user_pages_fast() where we have no VMA. We add two functions and expose them to generic code: arch_pte_access_permitted(pte_flags, write) arch_vma_access_permitted(vma, write) These are, of course, backed up in x86 arch code with checks against the PTE or VMA's protection key. 
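
In sketch form (this mirrors the __pkru_allows_read()/__pkru_allows_write() helpers added below; pkey and write are just illustrative names), PKRU reserves two bits per key, access-disable (AD) and write-disable (WD), and an access is permitted when:

    int  shift   = pkey * 2;
    bool ad      = pkru & (PKRU_AD_BIT << shift);   /* access-disable */
    bool wd      = pkru & (PKRU_WD_BIT << shift);   /* write-disable  */
    bool allowed = !ad && !(write && wd);

The x86 arch_vma_access_permitted() and arch_pte_access_permitted() implementations below reduce to this check on the VMA's pkey or the PTE's pkey bits; the generic fallbacks simply return true.
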
But, there are also cases where we do not want to respect protection keys. When we ptrace(), for instance, we do not want to apply the tracer's PKRU permissions to the PTEs from the process being traced. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Alexey Kardashevskiy Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boaz Harrosh Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Dave Hansen Cc: David Gibson Cc: David Hildenbrand Cc: David Vrabel Cc: Denys Vlasenko Cc: Dominik Dingel Cc: Dominik Vogt Cc: Guan Xuetao Cc: H. Peter Anvin Cc: Heiko Carstens Cc: Hugh Dickins Cc: Jason Low Cc: Jerome Marchand Cc: Juergen Gross Cc: Kirill A. Shutemov Cc: Laurent Dufour Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michael Ellerman Cc: Michal Hocko Cc: Mikulas Patocka Cc: Minchan Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sasha Levin Cc: Shachar Raindel Cc: Stephen Smalley Cc: Toshi Kani Cc: Vlastimil Babka Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-s390@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20160212210219.14D5D715@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/mmu_context.h | 11 +++++++ arch/s390/include/asm/mmu_context.h | 11 +++++++ arch/unicore32/include/asm/mmu_context.h | 11 +++++++ arch/x86/include/asm/mmu_context.h | 49 ++++++++++++++++++++++++++++++++ arch/x86/include/asm/pgtable.h | 29 +++++++++++++++++++ arch/x86/mm/fault.c | 21 +++++++++++++- arch/x86/mm/gup.c | 5 ++++ include/asm-generic/mm_hooks.h | 11 +++++++ mm/gup.c | 18 ++++++++++-- mm/memory.c | 4 +++ 10 files changed, 166 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 878c27771717..a0f1838c8e78 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -148,5 +148,16 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, { } +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write) +{ + /* by default, allow everything */ + return true; +} + +static inline bool arch_pte_access_permitted(pte_t pte, bool write) +{ + /* by default, allow everything */ + return true; +} #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_MMU_CONTEXT_H */ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index fb1b93ea3e3f..2627b338382c 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -130,4 +130,15 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, { } +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write) +{ + /* by default, allow everything */ + return true; +} + +static inline bool arch_pte_access_permitted(pte_t pte, bool write) +{ + /* by default, allow everything */ + return true; +} #endif /* __S390_MMU_CONTEXT_H */ diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h index 1cb5220afaf9..3133f947ade2 100644 --- a/arch/unicore32/include/asm/mmu_context.h +++ b/arch/unicore32/include/asm/mmu_context.h @@ -97,4 +97,15 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, { } +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write) +{ + /* by default, allow everything */ + return true; +} + +static 
inline bool arch_pte_access_permitted(pte_t pte, bool write) +{ + /* by default, allow everything */ + return true; +} #endif diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 94c4c8b5cb8f..19036cdbed8f 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -286,4 +286,53 @@ static inline int vma_pkey(struct vm_area_struct *vma) return pkey; } +static inline bool __pkru_allows_pkey(u16 pkey, bool write) +{ + u32 pkru = read_pkru(); + + if (!__pkru_allows_read(pkru, pkey)) + return false; + if (write && !__pkru_allows_write(pkru, pkey)) + return false; + + return true; +} + +/* + * We only want to enforce protection keys on the current process + * because we effectively have no access to PKRU for other + * processes or any way to tell *which * PKRU in a threaded + * process we could use. + * + * So do not enforce things if the VMA is not from the current + * mm, or if we are in a kernel thread. + */ +static inline bool vma_is_foreign(struct vm_area_struct *vma) +{ + if (!current->mm) + return true; + /* + * Should PKRU be enforced on the access to this VMA? If + * the VMA is from another process, then PKRU has no + * relevance and should not be enforced. + */ + if (current->mm != vma->vm_mm) + return true; + + return false; +} + +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write) +{ + /* allow access if the VMA is not one from this process */ + if (vma_is_foreign(vma)) + return true; + return __pkru_allows_pkey(vma_pkey(vma), write); +} + +static inline bool arch_pte_access_permitted(pte_t pte, bool write) +{ + return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write); +} + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e997dcc6ee2b..3cbfae80abb2 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -919,6 +919,35 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) } #endif +#define PKRU_AD_BIT 0x1 +#define PKRU_WD_BIT 0x2 + +static inline bool __pkru_allows_read(u32 pkru, u16 pkey) +{ + int pkru_pkey_bits = pkey * 2; + return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits)); +} + +static inline bool __pkru_allows_write(u32 pkru, u16 pkey) +{ + int pkru_pkey_bits = pkey * 2; + /* + * Access-disable disables writes too so we need to check + * both bits here. 
+ */ + return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits)); +} + +static inline u16 pte_flags_pkey(unsigned long pte_flags) +{ +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + /* ifdef to avoid doing 59-bit shift on 32-bit values */ + return (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0; +#else + return 0; +#endif +} + #include #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6e71dcf699ab..319331afae24 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -897,6 +897,16 @@ bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) __bad_area(regs, error_code, address, NULL, SEGV_MAPERR); } +static inline bool bad_area_access_from_pkeys(unsigned long error_code, + struct vm_area_struct *vma) +{ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return false; + if (error_code & PF_PK) + return true; + return false; +} + static noinline void bad_area_access_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct vm_area_struct *vma) @@ -906,7 +916,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, * But, doing it this way allows compiler optimizations * if pkeys are compiled out. */ - if (boot_cpu_has(X86_FEATURE_OSPKE) && (error_code & PF_PK)) + if (bad_area_access_from_pkeys(error_code, vma)) __bad_area(regs, error_code, address, vma, SEGV_PKUERR); else __bad_area(regs, error_code, address, vma, SEGV_ACCERR); @@ -1081,6 +1091,15 @@ int show_unhandled_signals = 1; static inline int access_error(unsigned long error_code, struct vm_area_struct *vma) { + /* + * Access or read was blocked by protection keys. We do + * this check before any others because we do not want + * to, for instance, confuse a protection-key-denied + * write with one for which we should do a COW. + */ + if (error_code & PF_PK) + return 1; + if (error_code & PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 2f0a32945cda..bab259e75984 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -11,6 +11,7 @@ #include #include +#include #include static inline pte_t gup_get_pte(pte_t *ptep) @@ -89,6 +90,10 @@ static inline int pte_allows_gup(unsigned long pteval, int write) if ((pteval & need_pte_bits) != need_pte_bits) return 0; + /* Check memory protection keys permissions. 
*/ + if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write)) + return 0; + return 1; } diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h index 866aa461efa5..c1fc5af3c384 100644 --- a/include/asm-generic/mm_hooks.h +++ b/include/asm-generic/mm_hooks.h @@ -26,4 +26,15 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, { } +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write) +{ + /* by default, allow everything */ + return true; +} + +static inline bool arch_pte_access_permitted(pte_t pte, bool write) +{ + /* by default, allow everything */ + return true; +} #endif /* _ASM_GENERIC_MM_HOOKS_H */ diff --git a/mm/gup.c b/mm/gup.c index b935c2c71ec9..e0f5f3574d16 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -444,6 +445,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_MAYREAD)) return -EFAULT; } + if (!arch_vma_access_permitted(vma, (gup_flags & FOLL_WRITE))) + return -EFAULT; return 0; } @@ -612,13 +615,19 @@ EXPORT_SYMBOL(__get_user_pages); bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) { - vm_flags_t vm_flags; - - vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; + bool write = !!(fault_flags & FAULT_FLAG_WRITE); + vm_flags_t vm_flags = write ? VM_WRITE : VM_READ; if (!(vm_flags & vma->vm_flags)) return false; + /* + * The architecture might have a hardware protection + * mechanism other than read/write that can deny access + */ + if (!arch_vma_access_permitted(vma, write)) + return false; + return true; } @@ -1172,6 +1181,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, pte_protnone(pte) || (write && !pte_write(pte))) goto pte_unmap; + if (!arch_pte_access_permitted(pte, write)) + goto pte_unmap; + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); head = compound_head(page); diff --git a/mm/memory.c b/mm/memory.c index 8bfbad0cca8c..d7e84fe6504d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -65,6 +65,7 @@ #include #include +#include #include #include #include @@ -3378,6 +3379,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd; pte_t *pte; + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE)) + return VM_FAULT_SIGSEGV; + if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); -- cgit v1.2.3-55-g7522 From 1b2ee1266ea647713dbaf44825967c180dfc8d76 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:21 -0800 Subject: mm/core: Do not enforce PKEY permissions on remote mm access We try to enforce protection keys in software the same way that we do in hardware. (See long example below). But, we only want to do this when accessing our *own* process's memory. If GDB set PKRU[6].AD=1 (disable access to PKEY 6), then tried to PTRACE_POKE a target process which just happened to have some mprotect_pkey(pkey=6) memory, we do *not* want to deny the debugger access to that memory. PKRU is fundamentally a thread-local structure and we do not want to enforce it on access to _another_ thread's data. This gets especially tricky when we have workqueues or other delayed-work mechanisms that might run in a random process's context. We can check that we only enforce pkeys when operating on our *own* mm, but delayed work gets performed when a random user context is active. 
We might end up with a situation where a delayed-work gup fails when running randomly under its "own" task but succeeds when running under another process. We want to avoid that. To avoid that, we use the new GUP flag: FOLL_REMOTE and add a fault flag: FAULT_FLAG_REMOTE. They indicate that we are walking an mm which is not guranteed to be the same as current->mm and should not be subject to protection key enforcement. Thanks to Jerome Glisse for pointing out this scenario. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Alexey Kardashevskiy Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Boaz Harrosh Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Dave Chinner Cc: Dave Hansen Cc: David Gibson Cc: Denys Vlasenko Cc: Dominik Dingel Cc: Dominik Vogt Cc: Eric B Munson Cc: Geliang Tang Cc: Guan Xuetao Cc: H. Peter Anvin Cc: Heiko Carstens Cc: Hugh Dickins Cc: Jan Kara Cc: Jason Low Cc: Jerome Marchand Cc: Joerg Roedel Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Laurent Dufour Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michael Ellerman Cc: Michal Hocko Cc: Mikulas Patocka Cc: Minchan Kim Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sasha Levin Cc: Shachar Raindel Cc: Vlastimil Babka Cc: Xie XiuQi Cc: iommu@lists.linux-foundation.org Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-s390@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/mmu_context.h | 3 ++- arch/s390/include/asm/mmu_context.h | 3 ++- arch/unicore32/include/asm/mmu_context.h | 3 ++- arch/x86/include/asm/mmu_context.h | 5 +++-- drivers/iommu/amd_iommu_v2.c | 1 + include/asm-generic/mm_hooks.h | 3 ++- include/linux/mm.h | 1 + mm/gup.c | 15 ++++++++++----- mm/ksm.c | 10 ++++++++-- mm/memory.c | 3 ++- 10 files changed, 33 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index a0f1838c8e78..df9bf3ed025b 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -148,7 +148,8 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, { } -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write) +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + bool write, bool foreign) { /* by default, allow everything */ return true; diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 2627b338382c..8906600922ce 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -130,7 +130,8 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, { } -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write) +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + bool write, bool foreign) { /* by default, allow everything */ return true; diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h index 3133f947ade2..e35632ef23c7 100644 --- a/arch/unicore32/include/asm/mmu_context.h +++ b/arch/unicore32/include/asm/mmu_context.h @@ -97,7 +97,8 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, { } -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write) +static inline bool 
arch_vma_access_permitted(struct vm_area_struct *vma, + bool write, bool foreign) { /* by default, allow everything */ return true; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 19036cdbed8f..b4d939a17e60 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -322,10 +322,11 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma) return false; } -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write) +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + bool write, bool foreign) { /* allow access if the VMA is not one from this process */ - if (vma_is_foreign(vma)) + if (foreign || vma_is_foreign(vma)) return true; return __pkru_allows_pkey(vma_pkey(vma), write); } diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index c865737326e1..56999d2fac07 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -526,6 +526,7 @@ static void do_fault(struct work_struct *work) flags |= FAULT_FLAG_USER; if (fault->flags & PPR_FAULT_WRITE) flags |= FAULT_FLAG_WRITE; + flags |= FAULT_FLAG_REMOTE; down_read(&mm->mmap_sem); vma = find_extend_vma(mm, address); diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h index c1fc5af3c384..d5c9633bd955 100644 --- a/include/asm-generic/mm_hooks.h +++ b/include/asm-generic/mm_hooks.h @@ -26,7 +26,8 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, { } -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write) +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + bool write, bool foreign) { /* by default, allow everything */ return true; diff --git a/include/linux/mm.h b/include/linux/mm.h index 3056369bab1d..2aaa0f0d67ea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -251,6 +251,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */ #define FAULT_FLAG_TRIED 0x20 /* Second try */ #define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */ +#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's diff --git a/mm/gup.c b/mm/gup.c index e0f5f3574d16..d276760163b3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -365,6 +365,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, return -ENOENT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; + if (*flags & FOLL_REMOTE) + fault_flags |= FAULT_FLAG_REMOTE; if (nonblocking) fault_flags |= FAULT_FLAG_ALLOW_RETRY; if (*flags & FOLL_NOWAIT) @@ -415,11 +417,13 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) { vm_flags_t vm_flags = vma->vm_flags; + int write = (gup_flags & FOLL_WRITE); + int foreign = (gup_flags & FOLL_REMOTE); if (vm_flags & (VM_IO | VM_PFNMAP)) return -EFAULT; - if (gup_flags & FOLL_WRITE) { + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; @@ -445,7 +449,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_MAYREAD)) return -EFAULT; } - if (!arch_vma_access_permitted(vma, (gup_flags & FOLL_WRITE))) + if (!arch_vma_access_permitted(vma, write, foreign)) return -EFAULT; return 0; } @@ -615,7 +619,8 @@ EXPORT_SYMBOL(__get_user_pages); bool 
vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) { - bool write = !!(fault_flags & FAULT_FLAG_WRITE); + bool write = !!(fault_flags & FAULT_FLAG_WRITE); + bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); vm_flags_t vm_flags = write ? VM_WRITE : VM_READ; if (!(vm_flags & vma->vm_flags)) @@ -623,9 +628,9 @@ bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) /* * The architecture might have a hardware protection - * mechanism other than read/write that can deny access + * mechanism other than read/write that can deny access. */ - if (!arch_vma_access_permitted(vma, write)) + if (!arch_vma_access_permitted(vma, write, foreign)) return false; return true; diff --git a/mm/ksm.c b/mm/ksm.c index c2013f638d11..b99e828172f6 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -359,6 +359,10 @@ static inline bool ksm_test_exit(struct mm_struct *mm) * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. + * + * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context + * of the process that owns 'vma'. We also do not want to enforce + * protection keys here anyway. */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { @@ -367,12 +371,14 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) do { cond_resched(); - page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION); + page = follow_page(vma, addr, + FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) ret = handle_mm_fault(vma->vm_mm, vma, addr, - FAULT_FLAG_WRITE); + FAULT_FLAG_WRITE | + FAULT_FLAG_REMOTE); else ret = VM_FAULT_WRITE; put_page(page); diff --git a/mm/memory.c b/mm/memory.c index d7e84fe6504d..76c44e5dffa2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3379,7 +3379,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd; pte_t *pte; - if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE)) + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + flags & FAULT_FLAG_REMOTE)) return VM_FAULT_SIGSEGV; if (unlikely(is_vm_hugetlb_page(vma))) -- cgit v1.2.3-55-g7522 From d61172b4b695b821388cdb6088a41d431bcbb93b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:24 -0800 Subject: mm/core, x86/mm/pkeys: Differentiate instruction fetches As discussed earlier, we attempt to enforce protection keys in software. However, the code checks all faults to ensure that they are not violating protection key permissions. It was assumed that all faults are either write faults where we check PKRU[key].WD (write disable) or read faults where we check the AD (access disable) bit. But, there is a third category of faults for protection keys: instruction faults. Instruction faults never run afoul of protection keys because they do not affect instruction fetches. So, plumb the PF_INSTR bit down in to the arch_vma_access_permitted() function where we do the protection key checks. We also add a new FAULT_FLAG_INSTRUCTION. This is because handle_mm_fault() is not passed the architecture-specific error_code where we keep PF_INSTR, so we need to encode the instruction fetch information in to the arch-generic fault flags. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210224.96928009@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/mmu_context.h | 2 +- arch/s390/include/asm/mmu_context.h | 2 +- arch/x86/include/asm/mmu_context.h | 5 ++++- arch/x86/mm/fault.c | 8 ++++++-- include/asm-generic/mm_hooks.h | 2 +- include/linux/mm.h | 1 + mm/gup.c | 11 +++++++++-- mm/memory.c | 1 + 8 files changed, 24 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index df9bf3ed025b..4eaab40e3ade 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -149,7 +149,7 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, } static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool foreign) + bool write, bool execute, bool foreign) { /* by default, allow everything */ return true; diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 8906600922ce..fa66b6dfa97a 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -131,7 +131,7 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, } static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool foreign) + bool write, bool execute, bool foreign) { /* by default, allow everything */ return true; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index b4d939a17e60..6572b949cbca 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -323,8 +323,11 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma) } static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool foreign) + bool write, bool execute, bool foreign) { + /* pkeys never affect instruction fetches */ + if (execute) + return true; /* allow access if the VMA is not one from this process */ if (foreign || vma_is_foreign(vma)) return true; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 68ecdffe284e..d81744e6f39f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -908,7 +908,8 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code, if (error_code & PF_PK) return true; /* this checks permission keys on the VMA: */ - if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), foreign)) + if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), + (error_code & PF_INSTR), foreign)) return true; return false; } @@ -1112,7 +1113,8 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) * faults just to hit a PF_PK as soon as we fill in a * page. 
*/ - if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), foreign)) + if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE), + (error_code & PF_INSTR), foreign)) return 1; if (error_code & PF_WRITE) { @@ -1267,6 +1269,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (error_code & PF_WRITE) flags |= FAULT_FLAG_WRITE; + if (error_code & PF_INSTR) + flags |= FAULT_FLAG_INSTRUCTION; /* * When running in the kernel we expect faults to occur only to diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h index d5c9633bd955..cc5d9a1405df 100644 --- a/include/asm-generic/mm_hooks.h +++ b/include/asm-generic/mm_hooks.h @@ -27,7 +27,7 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm, } static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool foreign) + bool write, bool execute, bool foreign) { /* by default, allow everything */ return true; diff --git a/include/linux/mm.h b/include/linux/mm.h index 2aaa0f0d67ea..7955c3eb83db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -252,6 +252,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_TRIED 0x20 /* Second try */ #define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */ #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ +#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's diff --git a/mm/gup.c b/mm/gup.c index d276760163b3..7f1c4fb77cfa 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -449,7 +449,11 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_MAYREAD)) return -EFAULT; } - if (!arch_vma_access_permitted(vma, write, foreign)) + /* + * gups are always data accesses, not instruction + * fetches, so execute=false here + */ + if (!arch_vma_access_permitted(vma, write, false, foreign)) return -EFAULT; return 0; } @@ -629,8 +633,11 @@ bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) /* * The architecture might have a hardware protection * mechanism other than read/write that can deny access. + * + * gup always represents data access, not instruction + * fetches, so execute=false here: */ - if (!arch_vma_access_permitted(vma, write, foreign)) + if (!arch_vma_access_permitted(vma, write, false, foreign)) return false; return true; diff --git a/mm/memory.c b/mm/memory.c index 76c44e5dffa2..99e9f928264a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3380,6 +3380,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *pte; if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) return VM_FAULT_SIGSEGV; -- cgit v1.2.3-55-g7522 From e6bfb70959a0ca6ddedb29e779a293c6f71ed0e7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:31 -0800 Subject: mm/core, arch, powerpc: Pass a protection key in to calc_vm_flag_bits() This plumbs a protection key through calc_vm_flag_bits(). We could have done this in calc_vm_prot_bits(), but I did not feel super strongly which way to go. It was pretty arbitrary which one to use. 
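As a rough sketch of the resulting call shape (standalone C with local *_S constants, not the kernel's headers), the translation of the prot bits is unchanged and the key is simply forwarded to the arch hook, which ignores it on architectures without protection keys:

/* Standalone sketch of the new calc_vm_prot_bits(prot, pkey) shape. */
#include <stdio.h>

#define PROT_READ_S  0x1
#define PROT_WRITE_S 0x2
#define PROT_EXEC_S  0x4

#define VM_READ_S    0x1
#define VM_WRITE_S   0x2
#define VM_EXEC_S    0x4

/* Architectures without protection keys simply ignore the key. */
static unsigned long arch_calc_vm_prot_bits_sketch(unsigned long prot,
                                                   unsigned long pkey)
{
    (void)prot;
    (void)pkey;
    return 0;
}

static unsigned long calc_vm_prot_bits_sketch(unsigned long prot,
                                              unsigned long pkey)
{
    return ((prot & PROT_READ_S)  ? VM_READ_S  : 0) |
           ((prot & PROT_WRITE_S) ? VM_WRITE_S : 0) |
           ((prot & PROT_EXEC_S)  ? VM_EXEC_S  : 0) |
           arch_calc_vm_prot_bits_sketch(prot, pkey);
}

int main(void)
{
    /* Callers that pass pkey == 0 get the same answer as before. */
    printf("0x%lx\n", calc_vm_prot_bits_sketch(PROT_READ_S | PROT_EXEC_S, 0));
    return 0;
}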
Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arve Hjønnevåg Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Gang Cc: Dan Williams Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: Denys Vlasenko Cc: Eric W. Biederman Cc: Geliang Tang Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Leon Romanovsky Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Maxime Coquelin Cc: Mel Gorman Cc: Michael Ellerman Cc: Oleg Nesterov Cc: Paul Gortmaker Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rik van Riel Cc: Riley Andrews Cc: Vladimir Davydov Cc: devel@driverdev.osuosl.org Cc: linux-api@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20160212210231.E6F1F0D6@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/mman.h | 5 +++-- drivers/char/agp/frontend.c | 2 +- drivers/staging/android/ashmem.c | 4 ++-- include/linux/mman.h | 6 +++--- mm/mmap.c | 2 +- mm/mprotect.c | 2 +- mm/nommu.c | 2 +- 7 files changed, 12 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h index 8565c254151a..2563c435a4b1 100644 --- a/arch/powerpc/include/asm/mman.h +++ b/arch/powerpc/include/asm/mman.h @@ -18,11 +18,12 @@ * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits() * here. How important is the optimization? */ -static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot) +static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, + unsigned long pkey) { return (prot & PROT_SAO) ? VM_SAO : 0; } -#define arch_calc_vm_prot_bits(prot) arch_calc_vm_prot_bits(prot) +#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey) static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags) { diff --git a/drivers/char/agp/frontend.c b/drivers/char/agp/frontend.c index 09f17eb73486..0f64d149c98d 100644 --- a/drivers/char/agp/frontend.c +++ b/drivers/char/agp/frontend.c @@ -156,7 +156,7 @@ static pgprot_t agp_convert_mmap_flags(int prot) { unsigned long prot_bits; - prot_bits = calc_vm_prot_bits(prot) | VM_SHARED; + prot_bits = calc_vm_prot_bits(prot, 0) | VM_SHARED; return vm_get_page_prot(prot_bits); } diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 5bb1283d19cd..2695ff121b04 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -372,8 +372,8 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) } /* requested protection bits must match our allowed protection mask */ - if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) & - calc_vm_prot_bits(PROT_MASK))) { + if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask, 0)) & + calc_vm_prot_bits(PROT_MASK, 0))) { ret = -EPERM; goto out; } diff --git a/include/linux/mman.h b/include/linux/mman.h index 16373c8f5f57..33e17f6a327a 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -35,7 +35,7 @@ static inline void vm_unacct_memory(long pages) */ #ifndef arch_calc_vm_prot_bits -#define arch_calc_vm_prot_bits(prot) 0 +#define arch_calc_vm_prot_bits(prot, pkey) 0 #endif #ifndef arch_vm_get_page_prot @@ -70,12 +70,12 @@ static inline int arch_validate_prot(unsigned long prot) * Combine the mmap "prot" argument into "vm_flags" used internally. 
*/ static inline unsigned long -calc_vm_prot_bits(unsigned long prot) +calc_vm_prot_bits(unsigned long prot, unsigned long pkey) { return _calc_vm_trans(prot, PROT_READ, VM_READ ) | _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) | _calc_vm_trans(prot, PROT_EXEC, VM_EXEC) | - arch_calc_vm_prot_bits(prot); + arch_calc_vm_prot_bits(prot, pkey); } /* diff --git a/mm/mmap.c b/mm/mmap.c index e2e9f48b06c2..784d2d6142a2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1313,7 +1313,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ - vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | + vm_flags |= calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) diff --git a/mm/mprotect.c b/mm/mprotect.c index f7cb3d4d9c2e..3790c8bee380 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -380,7 +380,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) prot |= PROT_EXEC; - vm_flags = calc_vm_prot_bits(prot); + vm_flags = calc_vm_prot_bits(prot, 0); down_write(¤t->mm->mmap_sem); diff --git a/mm/nommu.c b/mm/nommu.c index b64d04d19702..5ba39b82f241 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1082,7 +1082,7 @@ static unsigned long determine_vm_flags(struct file *file, { unsigned long vm_flags; - vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); + vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags); /* vm_flags |= mm->def_flags; */ if (!(capabilities & NOMMU_MAP_DIRECT)) { -- cgit v1.2.3-55-g7522 From 66d375709d2c891acc639538fd3179fa0cbb0daf Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:32 -0800 Subject: mm/core, x86/mm/pkeys: Add arch_validate_pkey() The syscall-level code is passed a protection key and need to return an appropriate error code if the protection key is bogus. We will be using this in subsequent patches. Note that this also begins a series of arch-specific calls that we need to expose in otherwise arch-independent code. We create a linux/pkeys.h header where we will put *all* the stubs for these functions. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210232.774EEAAB@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/pkeys.h | 6 ++++++ include/linux/pkeys.h | 25 +++++++++++++++++++++++++ mm/Kconfig | 2 ++ 4 files changed, 34 insertions(+) create mode 100644 arch/x86/include/asm/pkeys.h create mode 100644 include/linux/pkeys.h (limited to 'mm') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b8754348de4d..eda18cecdbbd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -156,6 +156,7 @@ config X86 select X86_DEV_DMA_OPS if X86_64 select X86_FEATURE_NAMES if PROC_FS select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS + select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS config INSTRUCTION_DECODER def_bool y diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h new file mode 100644 index 000000000000..04243c23380c --- /dev/null +++ b/arch/x86/include/asm/pkeys.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_PKEYS_H +#define _ASM_X86_PKEYS_H + +#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) + +#endif /*_ASM_X86_PKEYS_H */ diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h new file mode 100644 index 000000000000..55e465f93a28 --- /dev/null +++ b/include/linux/pkeys.h @@ -0,0 +1,25 @@ +#ifndef _LINUX_PKEYS_H +#define _LINUX_PKEYS_H + +#include +#include + +#ifdef CONFIG_ARCH_HAS_PKEYS +#include +#else /* ! CONFIG_ARCH_HAS_PKEYS */ +#define arch_max_pkey() (1) +#endif /* ! CONFIG_ARCH_HAS_PKEYS */ + +/* + * This is called from mprotect_pkey(). + * + * Returns true if the protection keys is valid. + */ +static inline bool validate_pkey(int pkey) +{ + if (pkey < 0) + return false; + return (pkey < arch_max_pkey()); +} + +#endif /* _LINUX_PKEYS_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 6cf439948297..2702bb6c21df 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -672,3 +672,5 @@ config FRAME_VECTOR config ARCH_USES_HIGH_VMA_FLAGS bool +config ARCH_HAS_PKEYS + bool -- cgit v1.2.3-55-g7522 From 62b5f7d013fc455b8db26cf01e421f4c0d264b92 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:40 -0800 Subject: mm/core, x86/mm/pkeys: Add execute-only protection keys support Protection keys provide new page-based protection in hardware. But, they have an interesting attribute: they only affect data accesses and never affect instruction fetches. That means that if we set up some memory which is set as "access-disabled" via protection keys, we can still execute from it. This patch uses protection keys to set up mappings to do just that. If a user calls: mmap(..., PROT_EXEC); or mprotect(ptr, sz, PROT_EXEC); (note PROT_EXEC-only without PROT_READ/WRITE), the kernel will notice this, and set a special protection key on the memory. It also sets the appropriate bits in the Protection Keys User Rights (PKRU) register so that the memory becomes unreadable and unwritable. I haven't found any userspace that does this today. With this facility in place, we expect userspace to move to use it eventually. Userspace _could_ start doing this today. Any PROT_EXEC calls get converted to PROT_READ inside the kernel, and would transparently be upgraded to "true" PROT_EXEC with this code. IOW, userspace never has to do any PROT_EXEC runtime detection. This feature provides enhanced protection against leaking executable memory contents. This helps thwart attacks which are attempting to find ROP gadgets on the fly. 
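A minimal userspace sketch of what this enables, assuming a pkeys-capable CPU and a kernel with this series applied (an assumption, not something the example can verify): the mapping below should be executable but any data read from it would be expected to fault.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4096;

    /*
     * PROT_EXEC with no PROT_READ/PROT_WRITE: on hardware with protection
     * keys, the kernel described above may back this mapping with the
     * dedicated execute-only key.
     */
    void *p = mmap(NULL, len, PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    printf("mapped execute-only candidate region at %p\n", p);

    /*
     * Dereferencing the mapping for data, e.g. *(volatile char *)p, would
     * be expected to raise SIGSEGV while the execute-only key is in effect.
     */
    munmap(p, len);
    return 0;
}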
But, the security provided by this approach is not comprehensive. The PKRU register which controls access permissions is a normal user register writable from unprivileged userspace. An attacker who can execute the 'wrpkru' instruction can easily disable the protection provided by this feature. The protection key that is used for execute-only support is permanently dedicated at compile time. This is fine for now because there is currently no API to set a protection key other than this one. Despite there being a constant PKRU value across the entire system, we do not set it unless this feature is in use in a process. That is to preserve the PKRU XSAVE 'init state', which can lead to faster context switches. PKRU *is* a user register and the kernel is modifying it. That means that code doing: pkru = rdpkru() pkru |= 0x100; mmap(..., PROT_EXEC); wrpkru(pkru); could lose the bits in PKRU that enforce execute-only permissions. To avoid this, we suggest avoiding ever calling mmap() or mprotect() when the PKRU value is expected to be unstable. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Gang Cc: Dan Williams Cc: Dave Chinner Cc: Dave Hansen Cc: David Hildenbrand Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mel Gorman Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Piotr Kwapulinski Cc: Rik van Riel Cc: Stephen Smalley Cc: Vladimir Murzin Cc: Will Deacon Cc: keescook@google.com Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210240.CB4BB5CA@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pkeys.h | 25 +++++++++++ arch/x86/kernel/fpu/xstate.c | 2 - arch/x86/mm/Makefile | 2 + arch/x86/mm/fault.c | 10 +++++ arch/x86/mm/pkeys.c | 101 +++++++++++++++++++++++++++++++++++++++++++ include/linux/pkeys.h | 3 ++ mm/mmap.c | 10 ++++- mm/mprotect.c | 8 ++-- 8 files changed, 154 insertions(+), 7 deletions(-) create mode 100644 arch/x86/mm/pkeys.c (limited to 'mm') diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 5061aec2ed5e..7b84565c916c 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -6,4 +6,29 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); +/* + * Try to dedicate one of the protection keys to be used as an + * execute-only protection key. 
+ */ +#define PKEY_DEDICATED_EXECUTE_ONLY 15 +extern int __execute_only_pkey(struct mm_struct *mm); +static inline int execute_only_pkey(struct mm_struct *mm) +{ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return 0; + + return __execute_only_pkey(mm); +} + +extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma, + int prot, int pkey); +static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, + int prot, int pkey) +{ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return 0; + + return __arch_override_mprotect_pkey(vma, prot, pkey); +} + #endif /*_ASM_X86_PKEYS_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 50813c35e9d9..1b1981812bb6 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -877,8 +877,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); u32 new_pkru_bits = 0; - if (!validate_pkey(pkey)) - return -EINVAL; /* * This check implies XSAVE support. OSPKE only gets * set if we enable XSAVE and we enable PKU in XCR0. diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index f9d38a48e3c8..67cf2e1e557b 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -34,3 +34,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MPX) += mpx.o +obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o + diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index d81744e6f39f..5877b92ab6f1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1108,6 +1108,16 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) */ if (error_code & PF_PK) return 1; + + if (!(error_code & PF_INSTR)) { + /* + * Assume all accesses require either read or execute + * permissions. This is not an instruction access, so + * it requires read permissions. + */ + if (!(vma->vm_flags & VM_READ)) + return 1; + } /* * Make sure to check the VMA so that we do not perform * faults just to hit a PF_PK as soon as we fill in a diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c new file mode 100644 index 000000000000..e8c474451928 --- /dev/null +++ b/arch/x86/mm/pkeys.c @@ -0,0 +1,101 @@ +/* + * Intel Memory Protection Keys management + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include /* mm_struct, vma, etc... */ +#include /* PKEY_* */ +#include + +#include /* boot_cpu_has, ... */ +#include /* vma_pkey() */ +#include /* fpregs_active() */ + +int __execute_only_pkey(struct mm_struct *mm) +{ + int ret; + + /* + * We do not want to go through the relatively costly + * dance to set PKRU if we do not need to. Check it + * first and assume that if the execute-only pkey is + * write-disabled that we do not have to set it + * ourselves. We need preempt off so that nobody + * can make fpregs inactive. 
+ */ + preempt_disable(); + if (fpregs_active() && + !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) { + preempt_enable(); + return PKEY_DEDICATED_EXECUTE_ONLY; + } + preempt_enable(); + ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY, + PKEY_DISABLE_ACCESS); + /* + * If the PKRU-set operation failed somehow, just return + * 0 and effectively disable execute-only support. + */ + if (ret) + return 0; + + return PKEY_DEDICATED_EXECUTE_ONLY; +} + +static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) +{ + /* Do this check first since the vm_flags should be hot */ + if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) + return false; + if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY) + return false; + + return true; +} + +/* + * This is only called for *plain* mprotect calls. + */ +int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) +{ + /* + * Is this an mprotect_pkey() call? If so, never + * override the value that came from the user. + */ + if (pkey != -1) + return pkey; + /* + * Look for a protection-key-drive execute-only mapping + * which is now being given permissions that are not + * execute-only. Move it back to the default pkey. + */ + if (vma_is_pkey_exec_only(vma) && + (prot & (PROT_READ|PROT_WRITE))) { + return 0; + } + /* + * The mapping is execute-only. Go try to get the + * execute-only protection key. If we fail to do that, + * fall through as if we do not have execute-only + * support. + */ + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(vma->vm_mm); + if (pkey > 0) + return pkey; + } + /* + * This is a vanilla, non-pkey mprotect (or we failed to + * setup execute-only), inherit the pkey from the VMA we + * are working on. + */ + return vma_pkey(vma); +} diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index fc325b367bd0..1d405a2b7272 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -13,6 +13,9 @@ #include #else /* ! CONFIG_ARCH_HAS_PKEYS */ #define arch_max_pkey() (1) +#define execute_only_pkey(mm) (0) +#define arch_override_mprotect_pkey(vma, prot, pkey) (0) +#define PKEY_DEDICATED_EXECUTE_ONLY 0 #endif /* ! CONFIG_ARCH_HAS_PKEYS */ /* diff --git a/mm/mmap.c b/mm/mmap.c index 784d2d6142a2..0175b7d055f0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -1270,6 +1271,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long pgoff, unsigned long *populate) { struct mm_struct *mm = current->mm; + int pkey = 0; *populate = 0; @@ -1309,11 +1311,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (offset_in_page(addr)) return addr; + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(mm); + if (pkey < 0) + pkey = 0; + } + /* Do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. 
*/ - vm_flags |= calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags) | + vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) diff --git a/mm/mprotect.c b/mm/mprotect.c index 3790c8bee380..fa37c4cd973a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -354,7 +355,7 @@ fail: SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, unsigned long, prot) { - unsigned long vm_flags, nstart, end, tmp, reqprot; + unsigned long nstart, end, tmp, reqprot; struct vm_area_struct *vma, *prev; int error = -EINVAL; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); @@ -380,8 +381,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) prot |= PROT_EXEC; - vm_flags = calc_vm_prot_bits(prot, 0); - down_write(¤t->mm->mmap_sem); vma = find_vma(current->mm, start); @@ -411,10 +410,11 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, for (nstart = start ; ; ) { unsigned long newflags; + int pkey = arch_override_mprotect_pkey(vma, prot, -1); /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - newflags = vm_flags; + newflags = calc_vm_prot_bits(prot, pkey); newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ -- cgit v1.2.3-55-g7522
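To summarize the mprotect() interaction introduced above, here is a condensed standalone model of the pkey-override decision; the constants are local stand-ins and the exec-only VMA test is reduced to a plain key comparison, so this is a sketch of the logic, not the kernel implementation:

/* Condensed standalone model of the mprotect pkey-override decision. */
#include <stdio.h>

#define PROT_READ_S      0x1
#define PROT_WRITE_S     0x2
#define PROT_EXEC_S      0x4
#define EXEC_ONLY_PKEY_S 15   /* stand-in for PKEY_DEDICATED_EXECUTE_ONLY */

static int override_mprotect_pkey_sketch(int vma_pkey, int prot, int user_pkey)
{
    /* An explicit key from mprotect_pkey() always wins. */
    if (user_pkey != -1)
        return user_pkey;
    /* An exec-only mapping gaining read/write falls back to the default key. */
    if (vma_pkey == EXEC_ONLY_PKEY_S && (prot & (PROT_READ_S | PROT_WRITE_S)))
        return 0;
    /* A plain PROT_EXEC request tries to take the execute-only key. */
    if (prot == PROT_EXEC_S)
        return EXEC_ONLY_PKEY_S;
    /* Otherwise inherit whatever key the VMA already had. */
    return vma_pkey;
}

int main(void)
{
    printf("%d\n", override_mprotect_pkey_sketch(0, PROT_EXEC_S, -1));   /* 15 */
    printf("%d\n", override_mprotect_pkey_sketch(15, PROT_READ_S, -1));  /* 0  */
    printf("%d\n", override_mprotect_pkey_sketch(3, PROT_READ_S, -1));   /* 3  */
    return 0;
}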