diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 132 |
1 files changed, 63 insertions, 69 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 745088810965..641cedfc8c0f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -25,6 +25,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/jhash.h> +#include <linux/numa.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -887,7 +888,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, struct zonelist *zonelist; struct zone *zone; struct zoneref *z; - int node = -1; + int node = NUMA_NO_NODE; zonelist = node_zonelist(nid, gfp_mask); @@ -919,7 +920,7 @@ retry_cpuset: /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepage_migration_supported(h)) + if (hugepage_movable_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; @@ -1586,8 +1587,8 @@ out_unlock: return page; } -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask) +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) { struct page *page; @@ -3238,7 +3239,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct page *ptepage; unsigned long addr; int cow; - struct address_space *mapping = vma->vm_file->f_mapping; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct mmu_notifier_range range; @@ -3250,23 +3250,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, mmu_notifier_range_init(&range, src, vma->vm_start, vma->vm_end); mmu_notifier_invalidate_range_start(&range); - } else { - /* - * For shared mappings i_mmap_rwsem must be held to call - * huge_pte_alloc, otherwise the returned ptep could go - * away if part of a shared pmd and another thread calls - * huge_pmd_unshare. - */ - i_mmap_lock_read(mapping); } for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; - src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr, sz); if (!dst_pte) { ret = -ENOMEM; @@ -3337,8 +3327,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, if (cow) mmu_notifier_invalidate_range_end(&range); - else - i_mmap_unlock_read(mapping); return ret; } @@ -3365,7 +3353,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * This is a hugetlb vma, all the pte entries should point * to huge page. */ - tlb_remove_check_page_size_change(tlb, sz); + tlb_change_page_size(tlb, sz); tlb_start_vma(tlb, vma); /* @@ -3637,7 +3625,6 @@ retry_avoidcopy: copy_user_huge_page(new_page, old_page, address, vma, pages_per_huge_page(h)); __SetPageUptodate(new_page); - set_page_huge_active(new_page); mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); @@ -3658,6 +3645,7 @@ retry_avoidcopy: make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); + set_page_huge_active(new_page); /* Make the old page be freed below */ new_page = old_page; } @@ -3742,6 +3730,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, pte_t new_pte; spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); + bool new_page = false; /* * Currently, we are forced to kill the process in the event the @@ -3755,16 +3744,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } /* - * We can not race with truncation due to holding i_mmap_rwsem. - * Check once here for faults beyond end of file. + * Use page lock to guard against racing truncation + * before we get page_table_lock. */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto out; - retry: page = find_lock_page(mapping, idx); if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; + /* * Check for page in userfault range */ @@ -3784,18 +3773,14 @@ retry: }; /* - * hugetlb_fault_mutex and i_mmap_rwsem must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. + * hugetlb_fault_mutex must be dropped before + * handling userfault. Reacquire after handling + * fault to make calling code simpler. */ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); - ret = handle_userfault(&vmf, VM_UFFD_MISSING); - - i_mmap_lock_read(mapping); mutex_lock(&hugetlb_fault_mutex_table[hash]); goto out; } @@ -3807,7 +3792,7 @@ retry: } clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); - set_page_huge_active(page); + new_page = true; if (vma->vm_flags & VM_MAYSHARE) { int err = huge_add_to_page_cache(page, mapping, idx); @@ -3854,6 +3839,9 @@ retry: } ptl = huge_pte_lock(h, mm, ptep); + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto backout; ret = 0; if (!huge_pte_none(huge_ptep_get(ptep))) @@ -3875,6 +3863,15 @@ retry: } spin_unlock(ptl); + + /* + * Only make newly allocated pages active. Existing pages found + * in the pagecache could be !page_huge_active() if they have been + * isolated for migration. + */ + if (new_page) + set_page_huge_active(page); + unlock_page(page); out: return ret; @@ -3940,11 +3937,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { - /* - * Since we hold no locks, ptep could be stale. That is - * OK as we are only making decisions based on content and - * not actually modifying content here. - */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { migration_entry_wait_huge(vma, mm, ptep); @@ -3952,33 +3944,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } else { + ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; } - /* - * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold - * until finished with ptep. This serves two purposes: - * 1) It prevents huge_pmd_unshare from being called elsewhere - * and making the ptep no longer valid. - * 2) It synchronizes us with file truncation. - * - * ptep could have already be assigned via huge_pte_offset. That - * is OK, as huge_pte_alloc will return the same value unless - * something changed. - */ mapping = vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); - ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); - if (!ptep) { - i_mmap_unlock_read(mapping); - return VM_FAULT_OOM; - } + idx = vma_hugecache_offset(h, vma, haddr); /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -4066,7 +4045,6 @@ out_ptl: } out_mutex: mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and @@ -4128,7 +4106,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * the set_pte_at() write. */ __SetPageUptodate(page); - set_page_huge_active(page); mapping = dst_vma->vm_file->f_mapping; idx = vma_hugecache_offset(h, dst_vma, dst_addr); @@ -4196,6 +4173,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); + set_page_huge_active(page); if (vm_shared) unlock_page(page); ret = 0; @@ -4301,7 +4279,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } if (ret & VM_FAULT_RETRY) { - if (nonblocking) + if (nonblocking && + !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *nonblocking = 0; *nr_pages = 0; /* @@ -4320,6 +4299,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; page = pte_page(huge_ptep_get(pte)); + + /* + * Instead of doing 'try_get_page()' below in the same_page + * loop, just check the count once here. + */ + if (unlikely(page_count(page) <= 0)) { + if (pages) { + spin_unlock(ptl); + remainder = 0; + err = -ENOMEM; + break; + } + } same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); @@ -4420,10 +4412,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, continue; } if (!huge_pte_none(pte)) { - pte = huge_ptep_get_and_clear(mm, address, ptep); - pte = pte_mkhuge(huge_pte_modify(pte, newprot)); + pte_t old_pte; + + old_pte = huge_ptep_modify_prot_start(vma, address, ptep); + pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); pte = arch_make_huge_pte(pte, vma, NULL, 0); - set_huge_pte_at(mm, address, ptep, pte); + huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; } spin_unlock(ptl); @@ -4671,12 +4665,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the - * code much cleaner. - * - * This routine must be called with i_mmap_rwsem held in at least read mode. - * For hugetlbfs, this prevents removal of any page table entries associated - * with the address space. This is important as we are setting up sharing - * based on existing page table entries (mappings). + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_rwsem section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. */ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { @@ -4693,6 +4685,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); + i_mmap_lock_write(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -4722,6 +4715,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); + i_mmap_unlock_write(mapping); return pte; } @@ -4732,7 +4726,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * Called with page table lock held and i_mmap_rwsem held in write mode. + * called with page table lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user |