summaryrefslogtreecommitdiffstats
path: root/mm/rmap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/rmap.c')
-rw-r--r--mm/rmap.c159
1 files changed, 60 insertions, 99 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index 0f3b7cda2a24..7df7984d476c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
avc->vma = vma;
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
-
- /*
- * It's critical to add new vmas to the tail of the anon_vma,
- * see comment in huge_memory.c:__split_huge_page().
- */
- list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}
/**
@@ -269,51 +264,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
}
/*
- * Some rmap walk that needs to find all ptes/hugepmds without false
- * negatives (like migrate and split_huge_page) running concurrent
- * with operations that copy or move pagetables (like mremap() and
- * fork()) to be safe. They depend on the anon_vma "same_anon_vma"
- * list to be in a certain order: the dst_vma must be placed after the
- * src_vma in the list. This is always guaranteed by fork() but
- * mremap() needs to call this function to enforce it in case the
- * dst_vma isn't newly allocated and chained with the anon_vma_clone()
- * function but just an extension of a pre-existing vma through
- * vma_merge.
- *
- * NOTE: the same_anon_vma list can still be changed by other
- * processes while mremap runs because mremap doesn't hold the
- * anon_vma mutex to prevent modifications to the list while it
- * runs. All we need to enforce is that the relative order of this
- * process vmas isn't changing (we don't care about other vmas
- * order). Each vma corresponds to an anon_vma_chain structure so
- * there's no risk that other processes calling anon_vma_moveto_tail()
- * and changing the same_anon_vma list under mremap() will screw with
- * the relative order of this process vmas in the list, because we
- * they can't alter the order of any vma that belongs to this
- * process. And there can't be another anon_vma_moveto_tail() running
- * concurrently with mremap() coming from this process because we hold
- * the mmap_sem for the whole mremap(). fork() ordering dependency
- * also shouldn't be affected because fork() only cares that the
- * parent vmas are placed in the list before the child vmas and
- * anon_vma_moveto_tail() won't reorder vmas from either the fork()
- * parent or child.
- */
-void anon_vma_moveto_tail(struct vm_area_struct *dst)
-{
- struct anon_vma_chain *pavc;
- struct anon_vma *root = NULL;
-
- list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
- struct anon_vma *anon_vma = pavc->anon_vma;
- VM_BUG_ON(pavc->vma != dst);
- root = lock_anon_vma_root(root, anon_vma);
- list_del(&pavc->same_anon_vma);
- list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
- }
- unlock_anon_vma_root(root);
-}
-
-/*
* Attach vma to its own anon_vma, as well as to the anon_vmas that
* the corresponding VMA in the parent process is attached to.
* Returns 0 on success, non-zero on failure.
@@ -381,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
struct anon_vma *anon_vma = avc->anon_vma;
root = lock_anon_vma_root(root, anon_vma);
- list_del(&avc->same_anon_vma);
+ anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
/*
* Leave empty anon_vmas on the list - we'll need
* to free them outside the lock.
*/
- if (list_empty(&anon_vma->head))
+ if (RB_EMPTY_ROOT(&anon_vma->rb_root))
continue;
list_del(&avc->same_vma);
@@ -416,7 +366,7 @@ static void anon_vma_ctor(void *data)
mutex_init(&anon_vma->mutex);
atomic_set(&anon_vma->refcount, 0);
- INIT_LIST_HEAD(&anon_vma->head);
+ anon_vma->rb_root = RB_ROOT;
}
void __init anon_vma_init(void)
@@ -560,22 +510,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
/*
* At what user virtual address is page expected in @vma?
- * Returns virtual address or -EFAULT if page's index/offset is not
- * within the range mapped the @vma.
*/
-inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
{
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- unsigned long address;
if (unlikely(is_vm_hugetlb_page(vma)))
pgoff = page->index << huge_page_order(page_hstate(page));
- address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
- /* page should be within @vma mapping range */
- return -EFAULT;
- }
+
+ return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ unsigned long address = __vma_address(page, vma);
+
+ /* page should be within @vma mapping range */
+ VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+
return address;
}
@@ -585,6 +539,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
+ unsigned long address;
if (PageAnon(page)) {
struct anon_vma *page__anon_vma = page_anon_vma(page);
/*
@@ -600,7 +555,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return -EFAULT;
} else
return -EFAULT;
- return vma_address(page, vma);
+ address = __vma_address(page, vma);
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ return -EFAULT;
+ return address;
}
/*
@@ -674,8 +632,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
pte_t *pte;
spinlock_t *ptl;
- address = vma_address(page, vma);
- if (address == -EFAULT) /* out of vma range */
+ address = __vma_address(page, vma);
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
return 0;
pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
if (!pte) /* the page is not in this mm */
@@ -769,6 +727,7 @@ static int page_referenced_anon(struct page *page,
{
unsigned int mapcount;
struct anon_vma *anon_vma;
+ pgoff_t pgoff;
struct anon_vma_chain *avc;
int referenced = 0;
@@ -777,11 +736,10 @@ static int page_referenced_anon(struct page *page,
return referenced;
mapcount = page_mapcount(page);
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
/*
* If we are reclaiming on behalf of a cgroup, skip
* counting on behalf of references from different
@@ -820,7 +778,6 @@ static int page_referenced_file(struct page *page,
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
int referenced = 0;
/*
@@ -846,10 +803,8 @@ static int page_referenced_file(struct page *page,
*/
mapcount = page_mapcount(page);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
/*
* If we are reclaiming on behalf of a cgroup, skip
* counting on behalf of references from different
@@ -929,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
pte_t entry;
flush_cache_page(vma, address, pte_pfn(*pte));
- entry = ptep_clear_flush_notify(vma, address, pte);
+ entry = ptep_clear_flush(vma, address, pte);
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(mm, address, pte, entry);
@@ -937,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
}
pte_unmap_unlock(pte, ptl);
+
+ if (ret)
+ mmu_notifier_invalidate_page(mm, address);
out:
return ret;
}
@@ -945,17 +903,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
{
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
int ret = 0;
BUG_ON(PageAnon(page));
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (vma->vm_flags & VM_SHARED) {
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret += page_mkclean_one(page, vma, address);
}
}
@@ -1128,7 +1083,7 @@ void page_add_new_anon_rmap(struct page *page,
else
__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
__page_set_anon_rmap(page, vma, address, 1);
- if (page_evictable(page, vma))
+ if (!mlocked_vma_newpage(vma, page))
lru_cache_add_lru(page, LRU_ACTIVE_ANON);
else
add_page_to_unevictable_list(page);
@@ -1203,7 +1158,10 @@ void page_remove_rmap(struct page *page)
} else {
__dec_zone_page_state(page, NR_FILE_MAPPED);
mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
+ if (unlikely(PageMlocked(page)))
+ clear_page_mlock(page);
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
@@ -1213,6 +1171,7 @@ void page_remove_rmap(struct page *page)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
+ return;
out:
if (!anon)
mem_cgroup_end_update_page_stat(page, &locked, &flags);
@@ -1256,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
- pteval = ptep_clear_flush_notify(vma, address, pte);
+ pteval = ptep_clear_flush(vma, address, pte);
/* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
@@ -1318,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
out_unmap:
pte_unmap_unlock(pte, ptl);
+ if (ret != SWAP_FAIL)
+ mmu_notifier_invalidate_page(mm, address);
out:
return ret;
@@ -1382,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
spinlock_t *ptl;
struct page *page;
unsigned long address;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
unsigned long end;
int ret = SWAP_AGAIN;
int locked_vma = 0;
@@ -1405,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
if (!pmd_present(*pmd))
return ret;
+ mmun_start = address;
+ mmun_end = end;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
/*
* If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
* keep the sem while scanning the cluster for mlocking pages.
@@ -1438,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush_notify(vma, address, pte);
+ pteval = ptep_clear_flush(vma, address, pte);
/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address))
@@ -1454,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
(*mapcount)--;
}
pte_unmap_unlock(pte - 1, ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (locked_vma)
up_read(&vma->vm_mm->mmap_sem);
return ret;
@@ -1492,6 +1460,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma)
static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
{
struct anon_vma *anon_vma;
+ pgoff_t pgoff;
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
@@ -1499,7 +1468,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
if (!anon_vma)
return ret;
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address;
@@ -1516,8 +1486,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
continue;
address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret = try_to_unmap_one(page, vma, address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page))
break;
@@ -1547,7 +1515,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
int ret = SWAP_AGAIN;
unsigned long cursor;
unsigned long max_nl_cursor = 0;
@@ -1555,10 +1522,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
unsigned int mapcount;
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret = try_to_unmap_one(page, vma, address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page))
goto out;
@@ -1576,7 +1541,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
goto out;
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.vm_set.list) {
+ shared.nonlinear) {
cursor = (unsigned long) vma->vm_private_data;
if (cursor > max_nl_cursor)
max_nl_cursor = cursor;
@@ -1608,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
do {
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.vm_set.list) {
+ shared.nonlinear) {
cursor = (unsigned long) vma->vm_private_data;
while ( cursor < max_nl_cursor &&
cursor < vma->vm_end - vma->vm_start) {
@@ -1631,7 +1596,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
* in locked vmas). Reset cursor on all unreserved nonlinear
* vmas, now forgetting on which ones it had fallen behind.
*/
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
vma->vm_private_data = NULL;
out:
mutex_unlock(&mapping->i_mmap_mutex);
@@ -1716,6 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
struct vm_area_struct *, unsigned long, void *), void *arg)
{
struct anon_vma *anon_vma;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
@@ -1729,11 +1695,9 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
if (!anon_vma)
return ret;
anon_vma_lock(anon_vma);
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret = rmap_one(page, vma, address, arg);
if (ret != SWAP_AGAIN)
break;
@@ -1748,16 +1712,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
int ret = SWAP_AGAIN;
if (!mapping)
return ret;
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret = rmap_one(page, vma, address, arg);
if (ret != SWAP_AGAIN)
break;