Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 113
1 file changed, 88 insertions, 25 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f7daa7de8f48..9ed58530f695 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -89,6 +89,7 @@ static unsigned int khugepaged_full_scans;
 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
 /* during fragmentation poll the hugepage allocator once every minute */
 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+static unsigned long khugepaged_sleep_expire;
 static struct task_struct *khugepaged_thread __read_mostly;
 static DEFINE_MUTEX(khugepaged_mutex);
 static DEFINE_SPINLOCK(khugepaged_mm_lock);
@@ -467,6 +468,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
 		return -EINVAL;
 
 	khugepaged_scan_sleep_millisecs = msecs;
+	khugepaged_sleep_expire = 0;
 	wake_up_interruptible(&khugepaged_wait);
 
 	return count;
@@ -494,6 +496,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
 		return -EINVAL;
 
 	khugepaged_alloc_sleep_millisecs = msecs;
+	khugepaged_sleep_expire = 0;
 	wake_up_interruptible(&khugepaged_wait);
 
 	return count;
@@ -764,10 +767,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 
 static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 {
-	pmd_t entry;
-	entry = mk_pmd(page, prot);
-	entry = pmd_mkhuge(entry);
-	return entry;
+	return pmd_mkhuge(mk_pmd(page, prot));
 }
 
 static inline struct list_head *page_deferred_list(struct page *page)
@@ -1013,6 +1013,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
 	return VM_FAULT_NOPAGE;
 }
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 
 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd)
@@ -1298,15 +1299,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
 	/*
 	 * We can only reuse the page if nobody else maps the huge page or it's
-	 * part. We can do it by checking page_mapcount() on each sub-page, but
-	 * it's expensive.
-	 * The cheaper way is to check page_count() to be equal 1: every
-	 * mapcount takes page reference reference, so this way we can
-	 * guarantee, that the PMD is the only mapping.
-	 * This can give false negative if somebody pinned the page, but that's
-	 * fine.
+	 * part.
 	 */
-	if (page_mapcount(page) == 1 && page_count(page) == 1) {
+	if (page_trans_huge_mapcount(page, NULL) == 1) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1704,20 +1699,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	return 1;
 }
 
-bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
-		  unsigned long old_addr,
+bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, unsigned long old_end,
 		  pmd_t *old_pmd, pmd_t *new_pmd)
 {
 	spinlock_t *old_ptl, *new_ptl;
 	pmd_t pmd;
-	struct mm_struct *mm = vma->vm_mm;
 
 	if ((old_addr & ~HPAGE_PMD_MASK) ||
 	    (new_addr & ~HPAGE_PMD_MASK) ||
-	    old_end - old_addr < HPAGE_PMD_SIZE ||
-	    (new_vma->vm_flags & VM_NOHUGEPAGE))
+	    old_end - old_addr < HPAGE_PMD_SIZE)
 		return false;
 
 	/*
@@ -2079,7 +2071,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		if (pte_write(pteval)) {
 			writable = true;
 		} else {
-			if (PageSwapCache(page) && !reuse_swap_page(page)) {
+			if (PageSwapCache(page) &&
+					!reuse_swap_page(page, NULL)) {
 				unlock_page(page);
 				result = SCAN_SWAP_CACHE_PAGE;
 				goto out;
@@ -2802,15 +2795,25 @@ static void khugepaged_do_scan(void)
 		put_page(hpage);
 }
 
+static bool khugepaged_should_wakeup(void)
+{
+	return kthread_should_stop() ||
+	       time_after_eq(jiffies, khugepaged_sleep_expire);
+}
+
 static void khugepaged_wait_work(void)
 {
 	if (khugepaged_has_work()) {
-		if (!khugepaged_scan_sleep_millisecs)
+		const unsigned long scan_sleep_jiffies =
+			msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
+
+		if (!scan_sleep_jiffies)
 			return;
 
+		khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
 		wait_event_freezable_timeout(khugepaged_wait,
-					     kthread_should_stop(),
-			msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
+					     khugepaged_should_wakeup(),
+					     scan_sleep_jiffies);
 		return;
 	}
 
@@ -3034,8 +3037,10 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
 		return;
 
 	/*
-	 * Caller holds the mmap_sem write mode, so a huge pmd cannot
-	 * materialize from under us.
+	 * Caller holds the mmap_sem write mode or the anon_vma lock,
+	 * so a huge pmd cannot materialize from under us (khugepaged
+	 * holds both the mmap_sem write mode and the anon_vma lock
+	 * write mode).
 	 */
 	__split_huge_pmd(vma, pmd, address, freeze);
 }
@@ -3118,7 +3123,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
 
 	/*
-	 * tail_page->_count is zero and not changing from under us. But
+	 * tail_page->_refcount is zero and not changing from under us. But
 	 * get_page_unless_zero() may be running from under us on the
 	 * tail_page. If we used atomic_set() below instead of atomic_inc(), we
 	 * would then run atomic_set() concurrently with
@@ -3223,6 +3228,64 @@ int total_mapcount(struct page *page)
 }
 
 /*
+ * This calculates accurately how many mappings a transparent hugepage
+ * has (unlike page_mapcount() which isn't fully accurate). This full
+ * accuracy is primarily needed to know if copy-on-write faults can
+ * reuse the page and change the mapping to read-write instead of
+ * copying them. At the same time this returns the total_mapcount too.
+ *
+ * The function returns the highest mapcount any one of the subpages
+ * has. If the return value is one, even if different processes are
+ * mapping different subpages of the transparent hugepage, they can
+ * all reuse it, because each process is reusing a different subpage.
+ *
+ * The total_mapcount is instead counting all virtual mappings of the
+ * subpages. If the total_mapcount is equal to "one", it tells the
+ * caller all mappings belong to the same "mm" and in turn the
+ * anon_vma of the transparent hugepage can become the vma->anon_vma
+ * local one as no other process may be mapping any of the subpages.
+ *
+ * It would be more accurate to replace page_mapcount() with
+ * page_trans_huge_mapcount(), however we only use
+ * page_trans_huge_mapcount() in the copy-on-write faults where we
+ * need full accuracy to avoid breaking page pinning, because
+ * page_trans_huge_mapcount() is slower than page_mapcount().
+ */
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
+{
+	int i, ret, _total_mapcount, mapcount;
+
+	/* hugetlbfs shouldn't call it */
+	VM_BUG_ON_PAGE(PageHuge(page), page);
+
+	if (likely(!PageTransCompound(page))) {
+		mapcount = atomic_read(&page->_mapcount) + 1;
+		if (total_mapcount)
+			*total_mapcount = mapcount;
+		return mapcount;
+	}
+
+	page = compound_head(page);
+
+	_total_mapcount = ret = 0;
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		mapcount = atomic_read(&page[i]._mapcount) + 1;
+		ret = max(ret, mapcount);
+		_total_mapcount += mapcount;
+	}
+	if (PageDoubleMap(page)) {
+		ret -= 1;
+		_total_mapcount -= HPAGE_PMD_NR;
+	}
+	mapcount = compound_mapcount(page);
+	ret += mapcount;
+	_total_mapcount += mapcount;
+	if (total_mapcount)
+		*total_mapcount = _total_mapcount;
+	return ret;
+}
+
+/*
  * This function splits huge page into normal pages. @page can point to any
  * subpage of huge page to split. Split doesn't change the position of @page.
  *
@@ -3287,7 +3350,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	if (mlocked)
 		lru_add_drain();
 
-	/* Prevent deferred_split_scan() touching ->_count */
+	/* Prevent deferred_split_scan() touching ->_refcount */
 	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 	count = page_count(head);
 	mapcount = total_mapcount(head);
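
A note on the khugepaged_sleep_expire change above: the two sysfs store
handlers reset the expiry to zero after updating the sleep period, so
time_after_eq() in khugepaged_should_wakeup() becomes true at once and a
sleeping khugepaged re-arms with the new period immediately instead of
finishing a possibly much longer old sleep. The following standalone C
sketch models that expiry pattern; jiffies, the tick loop and all names
here are simulated stand-ins, not kernel API:

    #include <stdbool.h>
    #include <stdio.h>

    /* Simulated stand-ins for kernel state; names are hypothetical. */
    static unsigned long jiffies;                 /* monotonic tick counter */
    static unsigned long sleep_expire;            /* absolute wakeup deadline */
    static const unsigned long scan_sleep = 10;   /* sleep period, in ticks */

    /* Analogue of khugepaged_should_wakeup(): wake once the deadline passed. */
    static bool should_wakeup(void)
    {
            return jiffies >= sleep_expire;       /* time_after_eq() analogue */
    }

    int main(void)
    {
            /* Analogue of khugepaged_wait_work(): arm deadline, then sleep. */
            sleep_expire = jiffies + scan_sleep;

            while (!should_wakeup()) {
                    jiffies++;                    /* one timer tick elapses */
                    if (jiffies == 4)             /* sysfs store happens here: */
                            sleep_expire = 0;     /* ...wake up immediately */
            }
            printf("woke at tick %lu\n", jiffies);   /* prints 4, not 10 */
            return 0;
    }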
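To see what the new page_trans_huge_mapcount() computes, consider two
processes that each PTE-map one distinct subpage of the same THP. The old
page_mapcount() == 1 && page_count() == 1 test gives a false negative as
soon as anybody holds an extra pin on the page, but the maximum
per-subpage mapcount is still 1, so each COW fault may reuse its own
subpage. A standalone sketch of the same arithmetic over a plain array
(trans_huge_mapcount, sub and double_map are hypothetical stand-ins for
struct page state; sub[i] already holds the +1-adjusted counts):

    #include <stdio.h>

    #define HPAGE_PMD_NR 512   /* subpages per PMD-sized THP on x86-64 */

    /*
     * Same arithmetic as page_trans_huge_mapcount() above: sub[i] is
     * the PTE mapcount of subpage i, compound counts whole-PMD
     * mappings, double_map models PageDoubleMap().
     */
    static int trans_huge_mapcount(const int sub[], int compound,
                                   int double_map, int *total_mapcount)
    {
            int ret = 0, total = 0;

            for (int i = 0; i < HPAGE_PMD_NR; i++) {
                    if (sub[i] > ret)
                            ret = sub[i];         /* highest subpage mapcount */
                    total += sub[i];              /* all virtual mappings */
            }
            if (double_map) {                     /* PMD- and PTE-mapped: */
                    ret -= 1;                     /* every sub[i] is inflated */
                    total -= HPAGE_PMD_NR;        /* by one, so compensate */
            }
            ret += compound;
            total += compound;
            if (total_mapcount)
                    *total_mapcount = total;
            return ret;
    }

    int main(void)
    {
            static int sub[HPAGE_PMD_NR];
            int total;

            sub[0] = 1;    /* process A PTE-maps subpage 0 */
            sub[1] = 1;    /* process B PTE-maps subpage 1 */
            printf("max = %d, total = %d\n",
                   trans_huge_mapcount(sub, 0, 0, &total), total);
            /* max = 1: each process may COW-reuse its own subpage;
             * total = 2: the mappings span more than one mm. */
            return 0;
    }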
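The _refcount comment touched in __split_huge_page_tail() is about a
lost-update race: get_page_unless_zero() takes a speculative reference on
the tail page with an atomic add-unless-zero, so the split side must also
use a read-modify-write (atomic_inc()/page_ref_add()) rather than
atomic_set(), or it could overwrite a reference the concurrent lookup just
took. A minimal C11 illustration of why the blind store loses an update
while the add does not (plain userspace atomics, not kernel code):

    #include <stdatomic.h>
    #include <stdio.h>

    int main(void)
    {
            atomic_int refcount = 0;     /* tail page _refcount starts at 0 */

            /* Split side: give the tail its first reference. A
             * read-modify-write here mirrors the kernel's atomic_inc(). */
            atomic_fetch_add(&refcount, 1);               /* 0 -> 1 */

            /* Concurrent get_page_unless_zero(): add-unless-zero now sees
             * a nonzero count and takes a speculative reference. */
            if (atomic_load(&refcount) != 0)
                    atomic_fetch_add(&refcount, 1);       /* 1 -> 2 */

            /* Had the split side instead issued a blind store, e.g.
             * atomic_store(&refcount, 1), ordered after the line above,
             * the speculative reference would be silently erased;
             * fetch_add cannot lose it. */
            printf("refcount = %d\n", atomic_load(&refcount));  /* prints 2 */
            return 0;
    }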