summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--CREDITS5
-rw-r--r--Documentation/admin-guide/mm/idle_page_tracking.rst5
-rw-r--r--Documentation/admin-guide/mm/pagemap.rst3
-rw-r--r--Documentation/filesystems/seq_file.txt63
-rw-r--r--MAINTAINERS3
-rw-r--r--arch/alpha/mm/fault.c3
-rw-r--r--arch/arc/mm/fault.c4
-rw-r--r--arch/arm/mm/dma-mapping.c5
-rw-r--r--arch/arm/mm/fault.c7
-rw-r--r--arch/arm64/mm/dma-mapping.c4
-rw-r--r--arch/arm64/mm/fault.c6
-rw-r--r--arch/hexagon/mm/vm_fault.c2
-rw-r--r--arch/ia64/mm/fault.c2
-rw-r--r--arch/m68k/mm/fault.c4
-rw-r--r--arch/microblaze/mm/fault.c2
-rw-r--r--arch/mips/mm/fault.c2
-rw-r--r--arch/nds32/mm/fault.c2
-rw-r--r--arch/nios2/mm/fault.c2
-rw-r--r--arch/openrisc/mm/fault.c2
-rw-r--r--arch/parisc/mm/fault.c2
-rw-r--r--arch/powerpc/include/asm/copro.h4
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c2
-rw-r--r--arch/powerpc/mm/copro_fault.c2
-rw-r--r--arch/powerpc/mm/fault.c7
-rw-r--r--arch/powerpc/platforms/cell/spufs/fault.c2
-rw-r--r--arch/riscv/mm/fault.c3
-rw-r--r--arch/s390/mm/fault.c13
-rw-r--r--arch/sh/boards/of-generic.c7
-rw-r--r--arch/sh/include/asm/kexec.h3
-rw-r--r--arch/sh/kernel/dwarf.c2
-rw-r--r--arch/sh/mm/fault.c4
-rw-r--r--arch/sparc/mm/fault_32.c3
-rw-r--r--arch/sparc/mm/fault_64.c3
-rw-r--r--arch/um/kernel/trap.c2
-rw-r--r--arch/unicore32/mm/fault.c9
-rw-r--r--arch/x86/mm/fault.c5
-rw-r--r--arch/xtensa/kernel/pci-dma.c2
-rw-r--r--arch/xtensa/mm/fault.c2
-rw-r--r--drivers/base/firmware_loader/fallback.c5
-rw-r--r--drivers/base/memory.c2
-rw-r--r--drivers/base/node.c49
-rw-r--r--drivers/dax/device.c2
-rw-r--r--drivers/firewire/core-cdev.c8
-rw-r--r--drivers/iommu/amd_iommu.c2
-rw-r--r--drivers/iommu/amd_iommu_v2.c2
-rw-r--r--drivers/iommu/intel-iommu.c3
-rw-r--r--drivers/iommu/intel-svm.c4
-rw-r--r--drivers/misc/cxl/fault.c2
-rw-r--r--drivers/misc/ocxl/link.c3
-rw-r--r--drivers/s390/char/vmcp.c2
-rw-r--r--drivers/staging/android/ion/ion_cma_heap.c2
-rw-r--r--fs/btrfs/extent_io.c2
-rw-r--r--fs/buffer.c12
-rw-r--r--fs/dcache.c3
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/inode.c5
-rw-r--r--fs/ext4/readpage.c5
-rw-r--r--fs/f2fs/data.c5
-rw-r--r--fs/hostfs/hostfs.h2
-rw-r--r--fs/hpfs/hpfs_fn.h13
-rw-r--r--fs/hpfs/namei.c12
-rw-r--r--fs/mpage.c118
-rw-r--r--fs/notify/dnotify/dnotify.c5
-rw-r--r--fs/notify/fanotify/fanotify.c14
-rw-r--r--fs/notify/fanotify/fanotify_user.c5
-rw-r--r--fs/notify/group.c3
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c7
-rw-r--r--fs/notify/inotify/inotify_user.c5
-rw-r--r--fs/ntfs/aops.c9
-rw-r--r--fs/ntfs/compress.c28
-rw-r--r--fs/ntfs/inode.c12
-rw-r--r--fs/ntfs/mft.c12
-rw-r--r--fs/ntfs/time.h27
-rw-r--r--fs/ocfs2/alloc.c60
-rw-r--r--fs/ocfs2/cluster/heartbeat.c12
-rw-r--r--fs/ocfs2/cluster/nodemanager.c6
-rw-r--r--fs/ocfs2/cluster/tcp.c2
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/inode.c5
-rw-r--r--fs/ocfs2/localalloc.c9
-rw-r--r--fs/ocfs2/quota_local.c15
-rw-r--r--fs/seq_file.c54
-rw-r--r--fs/super.c11
-rw-r--r--fs/ufs/balloc.c4
-rw-r--r--fs/ufs/ialloc.c2
-rw-r--r--fs/ufs/super.c4
-rw-r--r--fs/ufs/util.h14
-rw-r--r--fs/userfaultfd.c3
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--include/asm-generic/pgtable.h18
-rw-r--r--include/linux/bitfield.h2
-rw-r--r--include/linux/cma.h2
-rw-r--r--include/linux/dma-contiguous.h4
-rw-r--r--include/linux/fs.h5
-rw-r--r--include/linux/fsnotify_backend.h12
-rw-r--r--include/linux/hugetlb.h3
-rw-r--r--include/linux/kasan.h13
-rw-r--r--include/linux/list_lru.h43
-rw-r--r--include/linux/memcontrol.h74
-rw-r--r--include/linux/mm.h10
-rw-r--r--include/linux/node.h12
-rw-r--r--include/linux/page_ext.h15
-rw-r--r--include/linux/sched.h7
-rw-r--r--include/linux/sched/mm.h37
-rw-r--r--include/linux/shrinker.h11
-rw-r--r--include/linux/slab.h2
-rw-r--r--include/linux/vmacache.h6
-rw-r--r--init/Kconfig5
-rw-r--r--kernel/dma/contiguous.c6
-rw-r--r--kernel/dma/direct.c3
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/memremap.c10
-rw-r--r--mm/Kconfig11
-rw-r--r--mm/cma.c8
-rw-r--r--mm/cma_debug.c2
-rw-r--r--mm/fadvise.c8
-rw-r--r--mm/hmm.c19
-rw-r--r--mm/huge_memory.c11
-rw-r--r--mm/hugetlb.c39
-rw-r--r--mm/kasan/kasan_init.c316
-rw-r--r--mm/khugepaged.c60
-rw-r--r--mm/ksm.c5
-rw-r--r--mm/list_lru.c147
-rw-r--r--mm/memblock.c46
-rw-r--r--mm/memcontrol.c326
-rw-r--r--mm/memory.c146
-rw-r--r--mm/memory_hotplug.c96
-rw-r--r--mm/mempool.c12
-rw-r--r--mm/migrate.c3
-rw-r--r--mm/mlock.c3
-rw-r--r--mm/mmap.c9
-rw-r--r--mm/nommu.c4
-rw-r--r--mm/oom_kill.c16
-rw-r--r--mm/page-writeback.c4
-rw-r--r--mm/page_alloc.c33
-rw-r--r--mm/page_ext.c4
-rw-r--r--mm/shmem.c3
-rw-r--r--mm/slab.h6
-rw-r--r--mm/slab_common.c8
-rw-r--r--mm/slub.c3
-rw-r--r--mm/sparse-vmemmap.c61
-rw-r--r--mm/sparse.c290
-rw-r--r--mm/swap_slots.c4
-rw-r--r--mm/vmacache.c38
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmscan.c241
-rw-r--r--mm/workingset.c30
-rw-r--r--mm/zsmalloc.c36
-rwxr-xr-xscripts/spdxcheck.py11
-rw-r--r--tools/vm/page-types.c118
152 files changed, 2122 insertions, 1165 deletions
diff --git a/CREDITS b/CREDITS
index 989cda91c427..5befd2d714d0 100644
--- a/CREDITS
+++ b/CREDITS
@@ -2571,6 +2571,11 @@ S: Helstorfer Str. 7
S: D-30625 Hannover
S: Germany
+N: Ron Minnich
+E: rminnich@sandia.gov
+E: rminnich@gmail.com
+D: 9p filesystem development
+
N: Corey Minyard
E: minyard@wf-rch.cirr.com
E: minyard@mvista.com
diff --git a/Documentation/admin-guide/mm/idle_page_tracking.rst b/Documentation/admin-guide/mm/idle_page_tracking.rst
index 6f7b7ca1add3..df9394fb39c2 100644
--- a/Documentation/admin-guide/mm/idle_page_tracking.rst
+++ b/Documentation/admin-guide/mm/idle_page_tracking.rst
@@ -65,6 +65,11 @@ workload one should:
are not reclaimable, he or she can filter them out using
``/proc/kpageflags``.
+The page-types tool in the tools/vm directory can be used to assist in this.
+If the tool is run initially with the appropriate option, it will mark all the
+queried pages as idle. Subsequent runs of the tool can then show which pages have
+their idle flag cleared in the interim.
+
See :ref:`Documentation/admin-guide/mm/pagemap.rst <pagemap>` for more
information about ``/proc/pid/pagemap``, ``/proc/kpageflags``, and
``/proc/kpagecgroup``.
diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index 577af85beb41..3f7bade2c231 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -44,6 +44,9 @@ There are four components to pagemap:
* ``/proc/kpagecount``. This file contains a 64-bit count of the number of
times each page is mapped, indexed by PFN.
+The page-types tool in the tools/vm directory can be used to query the
+number of times a page is mapped.
+
* ``/proc/kpageflags``. This file contains a 64-bit set of flags for each
page, indexed by PFN.
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
index 9de4303201e1..d412b236a9d6 100644
--- a/Documentation/filesystems/seq_file.txt
+++ b/Documentation/filesystems/seq_file.txt
@@ -66,23 +66,39 @@ kernel 3.10. Current versions require the following update
The iterator interface
-Modules implementing a virtual file with seq_file must implement a simple
-iterator object that allows stepping through the data of interest.
-Iterators must be able to move to a specific position - like the file they
-implement - but the interpretation of that position is up to the iterator
-itself. A seq_file implementation that is formatting firewall rules, for
-example, could interpret position N as the Nth rule in the chain.
-Positioning can thus be done in whatever way makes the most sense for the
-generator of the data, which need not be aware of how a position translates
-to an offset in the virtual file. The one obvious exception is that a
-position of zero should indicate the beginning of the file.
+Modules implementing a virtual file with seq_file must implement an
+iterator object that allows stepping through the data of interest
+during a "session" (roughly one read() system call). If the iterator
+is able to move to a specific position - like the file they implement,
+though with freedom to map the position number to a sequence location
+in whatever way is convenient - the iterator need only exist
+transiently during a session. If the iterator cannot easily find a
+numerical position but works well with a first/next interface, the
+iterator can be stored in the private data area and continue from one
+session to the next.
+
+A seq_file implementation that is formatting firewall rules from a
+table, for example, could provide a simple iterator that interprets
+position N as the Nth rule in the chain. A seq_file implementation
+that presents the content of a, potentially volatile, linked list
+might record a pointer into that list, providing that can be done
+without risk of the current location being removed.
+
+Positioning can thus be done in whatever way makes the most sense for
+the generator of the data, which need not be aware of how a position
+translates to an offset in the virtual file. The one obvious exception
+is that a position of zero should indicate the beginning of the file.
The /proc/sequence iterator just uses the count of the next number it
will output as its position.
-Four functions must be implemented to make the iterator work. The first,
-called start() takes a position as an argument and returns an iterator
-which will start reading at that position. For our simple sequence example,
+Four functions must be implemented to make the iterator work. The
+first, called start(), starts a session and takes a position as an
+argument, returning an iterator which will start reading at that
+position. The pos passed to start() will always be either zero, or
+the most recent pos used in the previous session.
+
+For our simple sequence example,
the start() function looks like:
static void *ct_seq_start(struct seq_file *s, loff_t *pos)
@@ -101,11 +117,12 @@ implementations; in most cases the start() function should check for a
"past end of file" condition and return NULL if need be.
For more complicated applications, the private field of the seq_file
-structure can be used. There is also a special value which can be returned
-by the start() function called SEQ_START_TOKEN; it can be used if you wish
-to instruct your show() function (described below) to print a header at the
-top of the output. SEQ_START_TOKEN should only be used if the offset is
-zero, however.
+structure can be used to hold state from session to session. There is
+also a special value which can be returned by the start() function
+called SEQ_START_TOKEN; it can be used if you wish to instruct your
+show() function (described below) to print a header at the top of the
+output. SEQ_START_TOKEN should only be used if the offset is zero,
+however.
The next function to implement is called, amazingly, next(); its job is to
move the iterator forward to the next position in the sequence. The
@@ -121,9 +138,13 @@ complete. Here's the example version:
return spos;
}
-The stop() function is called when iteration is complete; its job, of
-course, is to clean up. If dynamic memory is allocated for the iterator,
-stop() is the place to free it.
+The stop() function closes a session; its job, of course, is to clean
+up. If dynamic memory is allocated for the iterator, stop() is the
+place to free it; if a lock was taken by start(), stop() must release
+that lock. The value that *pos was set to by the last next() call
+before stop() is remembered, and used for the first start() call of
+the next session unless lseek() has been called on the file; in that
+case next start() will be asked to start at position zero.
static void ct_seq_stop(struct seq_file *s, void *v)
{
diff --git a/MAINTAINERS b/MAINTAINERS
index b6dcaaa23adf..e9336962d0f2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -199,12 +199,13 @@ F: drivers/net/ethernet/8390/
9P FILE SYSTEM
M: Eric Van Hensbergen <ericvh@gmail.com>
-M: Ron Minnich <rminnich@sandia.gov>
M: Latchesar Ionkov <lucho@ionkov.net>
+M: Dominique Martinet <asmadeus@codewreck.org>
L: v9fs-developer@lists.sourceforge.net
W: http://swik.net/v9fs
Q: http://patchwork.kernel.org/project/v9fs-devel/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs.git
+T: git git://github.com/martinetd/linux.git
S: Maintained
F: Documentation/filesystems/9p.txt
F: fs/9p/
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index de2bd217adad..d73dc473fbb9 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -87,7 +87,8 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
const struct exception_table_entry *fixup;
- int fault, si_code = SEGV_MAPERR;
+ int si_code = SEGV_MAPERR;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
/* As of EV6, a load into $31/$f31 is a prefetch, and never faults
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index b884bbd6f354..db6913094be3 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -15,6 +15,7 @@
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/perf_event.h>
+#include <linux/mm_types.h>
#include <asm/pgalloc.h>
#include <asm/mmu.h>
@@ -66,7 +67,8 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
siginfo_t info;
- int fault, ret;
+ int ret;
+ vm_fault_t fault;
int write = regs->ecr_cause & ECR_C_PROTV_STORE; /* ST/EX */
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index ba0e786c952e..66566472c153 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -594,7 +594,7 @@ static void *__alloc_from_contiguous(struct device *dev, size_t size,
struct page *page;
void *ptr = NULL;
- page = dma_alloc_from_contiguous(dev, count, order, gfp);
+ page = dma_alloc_from_contiguous(dev, count, order, gfp & __GFP_NOWARN);
if (!page)
return NULL;
@@ -1299,7 +1299,8 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
unsigned long order = get_order(size);
struct page *page;
- page = dma_alloc_from_contiguous(dev, count, order, gfp);
+ page = dma_alloc_from_contiguous(dev, count, order,
+ gfp & __GFP_NOWARN);
if (!page)
goto error;
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 84becc911ee3..3232afb6fdc0 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -224,12 +224,12 @@ static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma)
return vma->vm_flags & mask ? false : true;
}
-static int __kprobes
+static vm_fault_t __kprobes
__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
unsigned int flags, struct task_struct *tsk)
{
struct vm_area_struct *vma;
- int fault;
+ vm_fault_t fault;
vma = find_vma(mm, addr);
fault = VM_FAULT_BADMAP;
@@ -264,7 +264,8 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
struct task_struct *tsk;
struct mm_struct *mm;
- int fault, sig, code;
+ int sig, code;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
if (notify_page_fault(regs, fsr))
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 61e93f0b5482..072c51fb07d7 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -355,7 +355,7 @@ static int __init atomic_pool_init(void)
if (dev_get_cma_area(NULL))
page = dma_alloc_from_contiguous(NULL, nr_pages,
- pool_size_order, GFP_KERNEL);
+ pool_size_order, false);
else
page = alloc_pages(GFP_DMA32, pool_size_order);
@@ -573,7 +573,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size,
struct page *page;
page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
- get_order(size), gfp);
+ get_order(size), gfp & __GFP_NOWARN);
if (!page)
return NULL;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 9943690a3924..50b30ff30de4 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -379,12 +379,12 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
#define VM_FAULT_BADMAP 0x010000
#define VM_FAULT_BADACCESS 0x020000
-static int __do_page_fault(struct mm_struct *mm, unsigned long addr,
+static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
unsigned int mm_flags, unsigned long vm_flags,
struct task_struct *tsk)
{
struct vm_area_struct *vma;
- int fault;
+ vm_fault_t fault;
vma = find_vma(mm, addr);
fault = VM_FAULT_BADMAP;
@@ -427,7 +427,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
struct task_struct *tsk;
struct mm_struct *mm;
struct siginfo si;
- int fault, major = 0;
+ vm_fault_t fault, major = 0;
unsigned long vm_flags = VM_READ | VM_WRITE;
unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
index 933bbcef5363..eb263e61daf4 100644
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -52,7 +52,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
struct mm_struct *mm = current->mm;
int si_signo;
int si_code = SEGV_MAPERR;
- int fault;
+ vm_fault_t fault;
const struct exception_table_entry *fixup;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 817fa120645f..a9d55ad8d67b 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -86,7 +86,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
struct vm_area_struct *vma, *prev_vma;
struct mm_struct *mm = current->mm;
unsigned long mask;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index f2ff3779875a..9b6163c05a75 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -70,7 +70,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct * vma;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
pr_debug("do page fault:\nregs->sr=%#x, regs->pc=%#lx, address=%#lx, %ld, %p\n",
@@ -136,7 +136,7 @@ good_area:
*/
fault = handle_mm_fault(vma, address, flags);
- pr_debug("handle_mm_fault returns %d\n", fault);
+ pr_debug("handle_mm_fault returns %x\n", fault);
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
return 0;
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index af607447c683..202ad6a494f5 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -90,7 +90,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
struct mm_struct *mm = current->mm;
int code = SEGV_MAPERR;
int is_write = error_code & ESR_S;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
regs->ear = address;
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 5f71f2b903b7..73d8a0f0b810 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -43,7 +43,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
struct mm_struct *mm = tsk->mm;
const int field = sizeof(unsigned long) * 2;
int si_code;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c
index 9bdb7c3ecbb6..b740534b152c 100644
--- a/arch/nds32/mm/fault.c
+++ b/arch/nds32/mm/fault.c
@@ -73,7 +73,7 @@ void do_page_fault(unsigned long entry, unsigned long addr,
struct mm_struct *mm;
struct vm_area_struct *vma;
int si_code;
- int fault;
+ vm_fault_t fault;
unsigned int mask = VM_READ | VM_WRITE | VM_EXEC;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c
index b804dd06ea1c..24fd84cf6006 100644
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -47,7 +47,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
int code = SEGV_MAPERR;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
cause >>= 2;
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index 9f011d16cc46..dc4dbafc1d83 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -53,7 +53,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
struct mm_struct *mm;
struct vm_area_struct *vma;
int si_code;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index a80117980fc2..c8e8b7c05558 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -262,7 +262,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
struct task_struct *tsk;
struct mm_struct *mm;
unsigned long acc_type;
- int fault = 0;
+ vm_fault_t fault = 0;
unsigned int flags;
if (faulthandler_disabled())
diff --git a/arch/powerpc/include/asm/copro.h b/arch/powerpc/include/asm/copro.h
index ce216df31381..48616fe7ea75 100644
--- a/arch/powerpc/include/asm/copro.h
+++ b/arch/powerpc/include/asm/copro.h
@@ -10,13 +10,15 @@
#ifndef _ASM_POWERPC_COPRO_H
#define _ASM_POWERPC_COPRO_H
+#include <linux/mm_types.h>
+
struct copro_slb
{
u64 esid, vsid;
};
int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
- unsigned long dsisr, unsigned *flt);
+ unsigned long dsisr, vm_fault_t *flt);
int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index d4a3f4da409b..fc6bb9630a9c 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -77,7 +77,7 @@ struct page *kvm_alloc_hpt_cma(unsigned long nr_pages)
VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES),
- GFP_KERNEL);
+ false);
}
EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma);
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index 7d0945bd3a61..c8da352e8686 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -34,7 +34,7 @@
* to handle fortunately.
*/
int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
- unsigned long dsisr, unsigned *flt)
+ unsigned long dsisr, vm_fault_t *flt)
{
struct vm_area_struct *vma;
unsigned long is_write;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 7d262c6437c4..d51cf5f4e45e 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -155,7 +155,7 @@ static noinline int bad_access(struct pt_regs *regs, unsigned long address)
}
static int do_sigbus(struct pt_regs *regs, unsigned long address,
- unsigned int fault)
+ vm_fault_t fault)
{
siginfo_t info;
unsigned int lsb = 0;
@@ -186,7 +186,8 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
return 0;
}
-static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
+static int mm_fault_error(struct pt_regs *regs, unsigned long addr,
+ vm_fault_t fault)
{
/*
* Kernel page fault interrupted by SIGKILL. We have no reason to
@@ -414,7 +415,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
int is_exec = TRAP(regs) == 0x400;
int is_user = user_mode(regs);
int is_write = page_fault_is_write(error_code);
- int fault, major = 0;
+ vm_fault_t fault, major = 0;
bool must_retry = false;
if (notify_page_fault(regs))
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index 1e002e94d0f6..83cf58daaa79 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -111,7 +111,7 @@ int spufs_handle_class1(struct spu_context *ctx)
{
u64 ea, dsisr, access;
unsigned long flags;
- unsigned flt = 0;
+ vm_fault_t flt = 0;
int ret;
/*
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 148c98ca9b45..88401d5125bc 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -41,7 +41,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs)
struct mm_struct *mm;
unsigned long addr, cause;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
- int fault, code = SEGV_MAPERR;
+ int code = SEGV_MAPERR;
+ vm_fault_t fault;
cause = regs->scause;
addr = regs->sbadaddr;
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 4cc3f06b0ab3..72af23bacbb5 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -341,7 +341,8 @@ static noinline int signal_return(struct pt_regs *regs)
return -EACCES;
}
-static noinline void do_fault_error(struct pt_regs *regs, int access, int fault)
+static noinline void do_fault_error(struct pt_regs *regs, int access,
+ vm_fault_t fault)
{
int si_code;
@@ -401,7 +402,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, int fault)
* 11 Page translation -> Not present (nullification)
* 3b Region third trans. -> Not present (nullification)
*/
-static inline int do_exception(struct pt_regs *regs, int access)
+static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
{
struct gmap *gmap;
struct task_struct *tsk;
@@ -411,7 +412,7 @@ static inline int do_exception(struct pt_regs *regs, int access)
unsigned long trans_exc_code;
unsigned long address;
unsigned int flags;
- int fault;
+ vm_fault_t fault;
tsk = current;
/*
@@ -564,7 +565,8 @@ out:
void do_protection_exception(struct pt_regs *regs)
{
unsigned long trans_exc_code;
- int access, fault;
+ int access;
+ vm_fault_t fault;
trans_exc_code = regs->int_parm_long;
/*
@@ -599,7 +601,8 @@ NOKPROBE_SYMBOL(do_protection_exception);
void do_dat_exception(struct pt_regs *regs)
{
- int access, fault;
+ int access;
+ vm_fault_t fault;
access = VM_READ | VM_EXEC | VM_WRITE;
fault = do_exception(regs, access);
diff --git a/arch/sh/boards/of-generic.c b/arch/sh/boards/of-generic.c
index 46b2481eec90..26789ad28193 100644
--- a/arch/sh/boards/of-generic.c
+++ b/arch/sh/boards/of-generic.c
@@ -56,15 +56,15 @@ const struct of_cpu_method __cpu_method_of_table_sentinel
static void sh_of_smp_probe(void)
{
- struct device_node *np = 0;
- const char *method = 0;
+ struct device_node *np;
+ const char *method = NULL;
const struct of_cpu_method *m = __cpu_method_of_table;
pr_info("SH generic board support: scanning for cpus\n");
init_cpu_possible(cpumask_of(0));
- while ((np = of_find_node_by_type(np, "cpu"))) {
+ for_each_node_by_type(np, "cpu") {
const __be32 *cell = of_get_property(np, "reg", NULL);
u64 id = -1;
if (cell) id = of_read_number(cell, of_n_addr_cells(np));
@@ -80,6 +80,7 @@ static void sh_of_smp_probe(void)
if (!method) {
np = of_find_node_by_name(NULL, "cpus");
of_property_read_string(np, "enable-method", &method);
+ of_node_put(np);
}
pr_info("CPU enable method: %s\n", method);
diff --git a/arch/sh/include/asm/kexec.h b/arch/sh/include/asm/kexec.h
index fd5f331a3912..927d80ba2332 100644
--- a/arch/sh/include/asm/kexec.h
+++ b/arch/sh/include/asm/kexec.h
@@ -4,6 +4,7 @@
#include <asm/ptrace.h>
#include <asm/string.h>
+#include <linux/kernel.h>
/*
* KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
@@ -61,7 +62,7 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
__asm__ __volatile__ ("stc gbr, %0" : "=r" (newregs->gbr));
__asm__ __volatile__ ("stc sr, %0" : "=r" (newregs->sr));
- newregs->pc = (unsigned long)current_text_addr();
+ newregs->pc = _THIS_IP_;
}
}
#else
diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c
index 1a2526676a87..bb511e2d9d68 100644
--- a/arch/sh/kernel/dwarf.c
+++ b/arch/sh/kernel/dwarf.c
@@ -599,7 +599,7 @@ struct dwarf_frame *dwarf_unwind_stack(unsigned long pc,
* time this function makes its first function call.
*/
if (!pc || !prev)
- pc = (unsigned long)current_text_addr();
+ pc = _THIS_IP_;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/*
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index b8e7bb84b6b1..6defd2c6d9b1 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -313,7 +313,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
static noinline int
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, unsigned int fault)
+ unsigned long address, vm_fault_t fault)
{
/*
* Pagefault was interrupted by SIGKILL. We have no reason to
@@ -396,7 +396,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
struct task_struct *tsk;
struct mm_struct *mm;
struct vm_area_struct * vma;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index 9f75b6444bf1..b0440b0edd97 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -166,7 +166,8 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
unsigned int fixup;
unsigned long g2;
int from_user = !(regs->psr & PSR_PS);
- int fault, code;
+ int code;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
if (text_fault)
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 63166fcf9e25..8f8a604c1300 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -278,7 +278,8 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned int insn = 0;
- int si_code, fault_code, fault;
+ int si_code, fault_code;
+ vm_fault_t fault;
unsigned long address, mm_rss;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index ec9a42c14c56..cced82946042 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -72,7 +72,7 @@ good_area:
}
do {
- int fault;
+ vm_fault_t fault;
fault = handle_mm_fault(vma, address, flags);
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index 381473412937..8f12a5b50a42 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -168,11 +168,11 @@ static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma)
return vma->vm_flags & mask ? false : true;
}
-static int __do_pf(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
- unsigned int flags, struct task_struct *tsk)
+static vm_fault_t __do_pf(struct mm_struct *mm, unsigned long addr,
+ unsigned int fsr, unsigned int flags, struct task_struct *tsk)
{
struct vm_area_struct *vma;
- int fault;
+ vm_fault_t fault;
vma = find_vma(mm, addr);
fault = VM_FAULT_BADMAP;
@@ -209,7 +209,8 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
struct task_struct *tsk;
struct mm_struct *mm;
- int fault, sig, code;
+ int sig, code;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index db1c042e9853..b9123c497e0a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -16,6 +16,7 @@
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
+#include <linux/mm_types.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
@@ -999,7 +1000,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 *pkey, unsigned int fault)
+ unsigned long address, u32 *pkey, vm_fault_t fault)
{
if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
no_context(regs, error_code, address, 0, 0);
@@ -1213,7 +1214,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
- int fault, major = 0;
+ vm_fault_t fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
u32 pkey;
diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index 392b4a80ebc2..a02dc563d290 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@@ -137,7 +137,7 @@ static void *xtensa_dma_alloc(struct device *dev, size_t size,
if (gfpflags_allow_blocking(flag))
page = dma_alloc_from_contiguous(dev, count, get_order(size),
- flag);
+ flag & __GFP_NOWARN);
if (!page)
page = alloc_pages(flag, get_order(size));
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index c111a833205a..2ab0e0dcd166 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -42,7 +42,7 @@ void do_page_fault(struct pt_regs *regs)
int code;
int is_write, is_exec;
- int fault;
+ vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
code = SEGV_MAPERR;
diff --git a/drivers/base/firmware_loader/fallback.c b/drivers/base/firmware_loader/fallback.c
index 202324291542..b5c865fe263b 100644
--- a/drivers/base/firmware_loader/fallback.c
+++ b/drivers/base/firmware_loader/fallback.c
@@ -219,11 +219,6 @@ static ssize_t firmware_loading_show(struct device *dev,
return sprintf(buf, "%d\n", loading);
}
-/* Some architectures don't have PAGE_KERNEL_RO */
-#ifndef PAGE_KERNEL_RO
-#define PAGE_KERNEL_RO PAGE_KERNEL
-#endif
-
/* one pages buffer should be mapped/unmapped only once */
static int map_fw_priv_pages(struct fw_priv *fw_priv)
{
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index f5e560188a18..c8a1cb0b6136 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -736,8 +736,6 @@ int hotplug_memory_register(int nid, struct mem_section *section)
mem->section_count++;
}
- if (mem->section_count == sections_per_block)
- ret = register_mem_sect_under_node(mem, nid, false);
out:
mutex_unlock(&mem_sysfs_mutex);
return ret;
diff --git a/drivers/base/node.c b/drivers/base/node.c
index a5e821d09656..1ac4c36e13bb 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -399,18 +399,12 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
}
/* register memory section under specified node if it spans that node */
-int register_mem_sect_under_node(struct memory_block *mem_blk, int nid,
- bool check_nid)
+int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
{
- int ret;
+ int ret, nid = *(int *)arg;
unsigned long pfn, sect_start_pfn, sect_end_pfn;
- if (!mem_blk)
- return -EFAULT;
-
mem_blk->nid = nid;
- if (!node_online(nid))
- return 0;
sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
@@ -433,7 +427,7 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid,
* case, during hotplug we know that all pages in the memory
* block belong to the same node.
*/
- if (check_nid) {
+ if (system_state == SYSTEM_BOOTING) {
page_nid = get_nid_for_pfn(pfn);
if (page_nid < 0)
continue;
@@ -490,41 +484,10 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
return 0;
}
-int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages,
- bool check_nid)
+int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long end_pfn = start_pfn + nr_pages;
- unsigned long pfn;
- struct memory_block *mem_blk = NULL;
- int err = 0;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- unsigned long section_nr = pfn_to_section_nr(pfn);
- struct mem_section *mem_sect;
- int ret;
-
- if (!present_section_nr(section_nr))
- continue;
- mem_sect = __nr_to_section(section_nr);
-
- /* same memblock ? */
- if (mem_blk)
- if ((section_nr >= mem_blk->start_section_nr) &&
- (section_nr <= mem_blk->end_section_nr))
- continue;
-
- mem_blk = find_memory_block_hinted(mem_sect, mem_blk);
-
- ret = register_mem_sect_under_node(mem_blk, nid, check_nid);
- if (!err)
- err = ret;
-
- /* discard ref obtained in find_memory_block() */
- }
-
- if (mem_blk)
- kobject_put(&mem_blk->dev.kobj);
- return err;
+ return walk_memory_range(start_pfn, end_pfn, (void *)&nid,
+ register_mem_sect_under_node);
}
#ifdef CONFIG_HUGETLBFS
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 108c37fca782..0a2acd7993f0 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -474,7 +474,7 @@ static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
return rc;
vma->vm_ops = &dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ vma->vm_flags |= VM_HUGEPAGE;
return 0;
}
diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index f0587273940e..d8e185582642 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -1205,7 +1205,7 @@ static int ioctl_get_cycle_timer2(struct client *client, union ioctl_arg *arg)
{
struct fw_cdev_get_cycle_timer2 *a = &arg->get_cycle_timer2;
struct fw_card *card = client->device->card;
- struct timespec ts = {0, 0};
+ struct timespec64 ts = {0, 0};
u32 cycle_time;
int ret = 0;
@@ -1214,9 +1214,9 @@ static int ioctl_get_cycle_timer2(struct client *client, union ioctl_arg *arg)
cycle_time = card->driver->read_csr(card, CSR_CYCLE_TIME);
switch (a->clk_id) {
- case CLOCK_REALTIME: getnstimeofday(&ts); break;
- case CLOCK_MONOTONIC: ktime_get_ts(&ts); break;
- case CLOCK_MONOTONIC_RAW: getrawmonotonic(&ts); break;
+ case CLOCK_REALTIME: ktime_get_real_ts64(&ts); break;
+ case CLOCK_MONOTONIC: ktime_get_ts64(&ts); break;
+ case CLOCK_MONOTONIC_RAW: ktime_get_raw_ts64(&ts); break;
default:
ret = -EINVAL;
}
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 596b95c50051..60b2eab29cd8 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2620,7 +2620,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
return NULL;
page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
- get_order(size), flag);
+ get_order(size), flag & __GFP_NOWARN);
if (!page)
return NULL;
}
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 1d0b53a04a08..58da65df03f5 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -508,7 +508,7 @@ static void do_fault(struct work_struct *work)
{
struct fault *fault = container_of(work, struct fault, work);
struct vm_area_struct *vma;
- int ret = VM_FAULT_ERROR;
+ vm_fault_t ret = VM_FAULT_ERROR;
unsigned int flags = 0;
struct mm_struct *mm;
u64 address;
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 115ff26e9ced..6a237d18fabf 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3758,7 +3758,8 @@ static void *intel_alloc_coherent(struct device *dev, size_t size,
if (gfpflags_allow_blocking(flags)) {
unsigned int count = size >> PAGE_SHIFT;
- page = dma_alloc_from_contiguous(dev, count, order, flags);
+ page = dma_alloc_from_contiguous(dev, count, order,
+ flags & __GFP_NOWARN);
if (page && iommu_no_mapping(dev) &&
page_to_phys(page) + size > dev->coherent_dma_mask) {
dma_release_from_contiguous(dev, page, count);
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 45f6e581cd56..7d65aab36a96 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -24,6 +24,7 @@
#include <linux/pci-ats.h>
#include <linux/dmar.h>
#include <linux/interrupt.h>
+#include <linux/mm_types.h>
#include <asm/page.h>
#define PASID_ENTRY_P BIT_ULL(0)
@@ -594,7 +595,8 @@ static irqreturn_t prq_event_thread(int irq, void *d)
struct vm_area_struct *vma;
struct page_req_dsc *req;
struct qi_desc resp;
- int ret, result;
+ int result;
+ vm_fault_t ret;
u64 address;
handled = 1;
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index d45f3e6b17d2..dc7b34174f85 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -134,7 +134,7 @@ static int cxl_handle_segment_miss(struct cxl_context *ctx,
int cxl_handle_mm_fault(struct mm_struct *mm, u64 dsisr, u64 dar)
{
- unsigned flt = 0;
+ vm_fault_t flt = 0;
int result;
unsigned long access, flags, inv_flags = 0;
diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
index a963b0a4a3c5..31695a078485 100644
--- a/drivers/misc/ocxl/link.c
+++ b/drivers/misc/ocxl/link.c
@@ -2,6 +2,7 @@
// Copyright 2017 IBM Corp.
#include <linux/sched/mm.h>
#include <linux/mutex.h>
+#include <linux/mm_types.h>
#include <linux/mmu_context.h>
#include <asm/copro.h>
#include <asm/pnv-ocxl.h>
@@ -126,7 +127,7 @@ static void ack_irq(struct spa *spa, enum xsl_response r)
static void xsl_fault_handler_bh(struct work_struct *fault_work)
{
- unsigned int flt = 0;
+ vm_fault_t flt = 0;
unsigned long access, flags, inv_flags = 0;
enum xsl_response r;
struct xsl_fault *fault = container_of(fault_work, struct xsl_fault,
diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c
index 948ce82a7725..0fa1b6b1491a 100644
--- a/drivers/s390/char/vmcp.c
+++ b/drivers/s390/char/vmcp.c
@@ -68,7 +68,7 @@ static void vmcp_response_alloc(struct vmcp_session *session)
* anymore the system won't work anyway.
*/
if (order > 2)
- page = cma_alloc(vmcp_cma, nr_pages, 0, GFP_KERNEL);
+ page = cma_alloc(vmcp_cma, nr_pages, 0, false);
if (page) {
session->response = (char *)page_to_phys(page);
session->cma_alloc = 1;
diff --git a/drivers/staging/android/ion/ion_cma_heap.c b/drivers/staging/android/ion/ion_cma_heap.c
index 49718c96bf9e..3fafd013d80a 100644
--- a/drivers/staging/android/ion/ion_cma_heap.c
+++ b/drivers/staging/android/ion/ion_cma_heap.c
@@ -39,7 +39,7 @@ static int ion_cma_allocate(struct ion_heap *heap, struct ion_buffer *buffer,
if (align > CONFIG_CMA_ALIGNMENT)
align = CONFIG_CMA_ALIGNMENT;
- pages = cma_alloc(cma_heap->cma, nr_pages, align, GFP_KERNEL);
+ pages = cma_alloc(cma_heap->cma, nr_pages, align, false);
if (!pages)
return -ENOMEM;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 628f1aef34b0..4dd6faab02bb 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3102,7 +3102,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
for (index = 0; index < nr_pages; index++) {
__do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
- bio, 0, bio_flags, 0, prev_em_start);
+ bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
}
diff --git a/fs/buffer.c b/fs/buffer.c
index c8c2b7d8b8d6..4cc679d5bf58 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -45,6 +45,7 @@
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/pagevec.h>
+#include <linux/sched/mm.h>
#include <trace/events/block.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
@@ -813,12 +814,16 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
bool retry)
{
struct buffer_head *bh, *head;
- gfp_t gfp = GFP_NOFS;
+ gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
long offset;
+ struct mem_cgroup *memcg;
if (retry)
gfp |= __GFP_NOFAIL;
+ memcg = get_mem_cgroup_from_page(page);
+ memalloc_use_memcg(memcg);
+
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) {
@@ -835,6 +840,9 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
/* Link the buffer to its page */
set_bh_page(bh, page, offset);
}
+out:
+ memalloc_unuse_memcg();
+ mem_cgroup_put(memcg);
return head;
/*
* In case anything failed, we just free everything we got.
@@ -848,7 +856,7 @@ no_grow:
} while (head);
}
- return NULL;
+ goto out;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
diff --git a/fs/dcache.c b/fs/dcache.c
index 8d2ec4898c2b..2e7e8d85e9b4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -292,7 +292,8 @@ void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry
spin_unlock(&dentry->d_lock);
name->name = p->name;
} else {
- memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN);
+ memcpy(name->inline_name, dentry->d_iname,
+ dentry->d_name.len + 1);
spin_unlock(&dentry->d_lock);
name->name = name->inline_name;
}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 047c327a6b23..28b2609f25c1 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -126,7 +126,6 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &ext2_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
return 0;
}
#else
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1fc013f3d944..0f0edd1cd0cd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3062,7 +3062,7 @@ static inline void ext4_set_de_type(struct super_block *sb,
/* readpages.c */
extern int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
- unsigned nr_pages);
+ unsigned nr_pages, bool is_readahead);
/* symlink.c */
extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7f8023340eb8..69d65d49837b 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -374,7 +374,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ vma->vm_flags |= VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8f6ad7667974..d0dd585add6a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3325,7 +3325,8 @@ static int ext4_readpage(struct file *file, struct page *page)
ret = ext4_readpage_inline(inode, page);
if (ret == -EAGAIN)
- return ext4_mpage_readpages(page->mapping, NULL, page, 1);
+ return ext4_mpage_readpages(page->mapping, NULL, page, 1,
+ false);
return ret;
}
@@ -3340,7 +3341,7 @@ ext4_readpages(struct file *file, struct address_space *mapping,
if (ext4_has_inline_data(inode))
return 0;
- return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);
+ return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true);
}
static void ext4_invalidatepage(struct page *page, unsigned int offset,
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 19b87a8de6ff..f461d75ac049 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -98,7 +98,7 @@ static void mpage_end_io(struct bio *bio)
int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
- unsigned nr_pages)
+ unsigned nr_pages, bool is_readahead)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
@@ -259,7 +259,8 @@ int ext4_mpage_readpages(struct address_space *mapping,
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
bio->bi_private = ctx;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio_set_op_attrs(bio, REQ_OP_READ,
+ is_readahead ? REQ_RAHEAD : 0);
}
length = first_hole << blkbits;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8f931d699287..b7c9b58acf3e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1421,6 +1421,11 @@ out:
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
+ *
+ * Note that the aops->readpages() function is ONLY used for read-ahead. If
+ * this function ever deviates from doing just read-ahead, it should either
+ * use ->readpage() or do the necessary surgery to decouple ->readpages()
+ * readom read-ahead.
*/
static int f2fs_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index cb8374af08a6..33b8423ef0c9 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -19,7 +19,7 @@
#define HOSTFS_ATTR_ATIME_SET 128
#define HOSTFS_ATTR_MTIME_SET 256
-/* These two are unused by hostfs. */
+/* This one is unused by hostfs. */
#define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */
#define HOSTFS_ATTR_ATTR_FLAG 1024
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 2a153aed4c19..ab2e7cc2ff33 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -334,16 +334,23 @@ long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg);
* local time (HPFS) to GMT (Unix)
*/
-static inline time_t local_to_gmt(struct super_block *s, time32_t t)
+static inline time64_t local_to_gmt(struct super_block *s, time32_t t)
{
extern struct timezone sys_tz;
return t + sys_tz.tz_minuteswest * 60 + hpfs_sb(s)->sb_timeshift;
}
-static inline time32_t gmt_to_local(struct super_block *s, time_t t)
+static inline time32_t gmt_to_local(struct super_block *s, time64_t t)
{
extern struct timezone sys_tz;
- return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
+ t = t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
+
+ return clamp_t(time64_t, t, 0, U32_MAX);
+}
+
+static inline time32_t local_get_seconds(struct super_block *s)
+{
+ return gmt_to_local(s, ktime_get_real_seconds());
}
/*
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index a3615e4c730d..082b7c76dd0c 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -11,7 +11,7 @@
static void hpfs_update_directory_times(struct inode *dir)
{
- time_t t = get_seconds();
+ time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb));
if (t == dir->i_mtime.tv_sec &&
t == dir->i_ctime.tv_sec)
return;
@@ -50,7 +50,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
/*dee.archive = 0;*/
dee.hidden = name[0] == '.';
dee.fnode = cpu_to_le32(fno);
- dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+ dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
result = new_inode(dir->i_sb);
if (!result)
goto bail2;
@@ -91,7 +91,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
dnode->root_dnode = 1;
dnode->up = cpu_to_le32(fno);
de = hpfs_add_de(dir->i_sb, dnode, "\001\001", 2, 0);
- de->creation_date = de->write_date = de->read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+ de->creation_date = de->write_date = de->read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
if (!(mode & 0222)) de->read_only = 1;
de->first = de->directory = 1;
/*de->hidden = de->system = 0;*/
@@ -151,7 +151,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, b
dee.archive = 1;
dee.hidden = name[0] == '.';
dee.fnode = cpu_to_le32(fno);
- dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+ dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
result = new_inode(dir->i_sb);
if (!result)
@@ -238,7 +238,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de
dee.archive = 1;
dee.hidden = name[0] == '.';
dee.fnode = cpu_to_le32(fno);
- dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+ dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
result = new_inode(dir->i_sb);
if (!result)
@@ -314,7 +314,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
dee.archive = 1;
dee.hidden = name[0] == '.';
dee.fnode = cpu_to_le32(fno);
- dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(gmt_to_local(dir->i_sb, get_seconds()));
+ dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb));
result = new_inode(dir->i_sb);
if (!result)
diff --git a/fs/mpage.c b/fs/mpage.c
index b73638db9866..c820dc9bebab 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -133,6 +133,17 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
} while (page_bh != head);
}
+struct mpage_readpage_args {
+ struct bio *bio;
+ struct page *page;
+ unsigned int nr_pages;
+ bool is_readahead;
+ sector_t last_block_in_bio;
+ struct buffer_head map_bh;
+ unsigned long first_logical_block;
+ get_block_t *get_block;
+};
+
/*
* This is the worker routine which does all the work of mapping the disk
* blocks and constructs largest possible bios, submits them for IO if the
@@ -142,16 +153,14 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
* represent the validity of its disk mapping and to decide when to do the next
* get_block() call.
*/
-static struct bio *
-do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
- sector_t *last_block_in_bio, struct buffer_head *map_bh,
- unsigned long *first_logical_block, get_block_t get_block,
- gfp_t gfp)
+static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
{
+ struct page *page = args->page;
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
+ struct buffer_head *map_bh = &args->map_bh;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
@@ -161,14 +170,24 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
struct block_device *bdev = NULL;
int length;
int fully_mapped = 1;
+ int op_flags;
unsigned nblocks;
unsigned relative_block;
+ gfp_t gfp;
+
+ if (args->is_readahead) {
+ op_flags = REQ_RAHEAD;
+ gfp = readahead_gfp_mask(page->mapping);
+ } else {
+ op_flags = 0;
+ gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+ }
if (page_has_buffers(page))
goto confused;
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
- last_block = block_in_file + nr_pages * blocks_per_page;
+ last_block = block_in_file + args->nr_pages * blocks_per_page;
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
@@ -178,9 +197,10 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
* Map blocks using the result from the previous get_blocks call first.
*/
nblocks = map_bh->b_size >> blkbits;
- if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
- block_in_file < (*first_logical_block + nblocks)) {
- unsigned map_offset = block_in_file - *first_logical_block;
+ if (buffer_mapped(map_bh) &&
+ block_in_file > args->first_logical_block &&
+ block_in_file < (args->first_logical_block + nblocks)) {
+ unsigned map_offset = block_in_file - args->first_logical_block;
unsigned last = nblocks - map_offset;
for (relative_block = 0; ; relative_block++) {
@@ -208,9 +228,9 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
if (block_in_file < last_block) {
map_bh->b_size = (last_block-block_in_file) << blkbits;
- if (get_block(inode, block_in_file, map_bh, 0))
+ if (args->get_block(inode, block_in_file, map_bh, 0))
goto confused;
- *first_logical_block = block_in_file;
+ args->first_logical_block = block_in_file;
}
if (!buffer_mapped(map_bh)) {
@@ -273,43 +293,45 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
- if (bio && (*last_block_in_bio != blocks[0] - 1))
- bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
+ if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
+ args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
alloc_new:
- if (bio == NULL) {
+ if (args->bio == NULL) {
if (first_hole == blocks_per_page) {
if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
page))
goto out;
}
- bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- min_t(int, nr_pages, BIO_MAX_PAGES), gfp);
- if (bio == NULL)
+ args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
+ min_t(int, args->nr_pages,
+ BIO_MAX_PAGES),
+ gfp);
+ if (args->bio == NULL)
goto confused;
}
length = first_hole << blkbits;
- if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
+ if (bio_add_page(args->bio, page, length, 0) < length) {
+ args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
goto alloc_new;
}
- relative_block = block_in_file - *first_logical_block;
+ relative_block = block_in_file - args->first_logical_block;
nblocks = map_bh->b_size >> blkbits;
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
(first_hole != blocks_per_page))
- bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
+ args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
else
- *last_block_in_bio = blocks[blocks_per_page - 1];
+ args->last_block_in_bio = blocks[blocks_per_page - 1];
out:
- return bio;
+ return args->bio;
confused:
- if (bio)
- bio = mpage_bio_submit(REQ_OP_READ, 0, bio);
+ if (args->bio)
+ args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
if (!PageUptodate(page))
- block_read_full_page(page, get_block);
+ block_read_full_page(page, args->get_block);
else
unlock_page(page);
goto out;
@@ -363,15 +385,12 @@ int
mpage_readpages(struct address_space *mapping, struct list_head *pages,
unsigned nr_pages, get_block_t get_block)
{
- struct bio *bio = NULL;
+ struct mpage_readpage_args args = {
+ .get_block = get_block,
+ .is_readahead = true,
+ };
unsigned page_idx;
- sector_t last_block_in_bio = 0;
- struct buffer_head map_bh;
- unsigned long first_logical_block = 0;
- gfp_t gfp = readahead_gfp_mask(mapping);
- map_bh.b_state = 0;
- map_bh.b_size = 0;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = lru_to_page(pages);
@@ -379,18 +398,16 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
list_del(&page->lru);
if (!add_to_page_cache_lru(page, mapping,
page->index,
- gfp)) {
- bio = do_mpage_readpage(bio, page,
- nr_pages - page_idx,
- &last_block_in_bio, &map_bh,
- &first_logical_block,
- get_block, gfp);
+ readahead_gfp_mask(mapping))) {
+ args.page = page;
+ args.nr_pages = nr_pages - page_idx;
+ args.bio = do_mpage_readpage(&args);
}
put_page(page);
}
BUG_ON(!list_empty(pages));
- if (bio)
- mpage_bio_submit(REQ_OP_READ, 0, bio);
+ if (args.bio)
+ mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio);
return 0;
}
EXPORT_SYMBOL(mpage_readpages);
@@ -400,18 +417,15 @@ EXPORT_SYMBOL(mpage_readpages);
*/
int mpage_readpage(struct page *page, get_block_t get_block)
{
- struct bio *bio = NULL;
- sector_t last_block_in_bio = 0;
- struct buffer_head map_bh;
- unsigned long first_logical_block = 0;
- gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+ struct mpage_readpage_args args = {
+ .page = page,
+ .nr_pages = 1,
+ .get_block = get_block,
+ };
- map_bh.b_state = 0;
- map_bh.b_size = 0;
- bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
- &map_bh, &first_logical_block, get_block, gfp);
- if (bio)
- mpage_bio_submit(REQ_OP_READ, 0, bio);
+ args.bio = do_mpage_readpage(&args);
+ if (args.bio)
+ mpage_bio_submit(REQ_OP_READ, 0, args.bio);
return 0;
}
EXPORT_SYMBOL(mpage_readpage);
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index e2bea2ac5dfb..a6365e6bc047 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -384,8 +384,9 @@ out_err:
static int __init dnotify_init(void)
{
- dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
- dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC);
+ dnotify_struct_cache = KMEM_CACHE(dnotify_struct,
+ SLAB_PANIC|SLAB_ACCOUNT);
+ dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT);
dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
if (IS_ERR(dnotify_group))
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index f90842efea13..eb4e75175cfb 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -11,6 +11,7 @@
#include <linux/types.h>
#include <linux/wait.h>
#include <linux/audit.h>
+#include <linux/sched/mm.h>
#include "fanotify.h"
@@ -140,8 +141,8 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
struct inode *inode, u32 mask,
const struct path *path)
{
- struct fanotify_event_info *event;
- gfp_t gfp = GFP_KERNEL;
+ struct fanotify_event_info *event = NULL;
+ gfp_t gfp = GFP_KERNEL_ACCOUNT;
/*
* For queues with unlimited length lost events are not expected and
@@ -151,19 +152,22 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
if (group->max_events == UINT_MAX)
gfp |= __GFP_NOFAIL;
+ /* Whoever is interested in the event, pays for the allocation. */
+ memalloc_use_memcg(group->memcg);
+
if (fanotify_is_perm_event(mask)) {
struct fanotify_perm_event_info *pevent;
pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
if (!pevent)
- return NULL;
+ goto out;
event = &pevent->fae;
pevent->response = 0;
goto init;
}
event = kmem_cache_alloc(fanotify_event_cachep, gfp);
if (!event)
- return NULL;
+ goto out;
init: __maybe_unused
fsnotify_init_event(&event->fse, inode, mask);
event->tgid = get_pid(task_tgid(current));
@@ -174,6 +178,8 @@ init: __maybe_unused
event->path.mnt = NULL;
event->path.dentry = NULL;
}
+out:
+ memalloc_unuse_memcg();
return event;
}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index d736a833fe39..69054886915b 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -16,6 +16,7 @@
#include <linux/uaccess.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
+#include <linux/memcontrol.h>
#include <asm/ioctls.h>
@@ -731,6 +732,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->fanotify_data.user = user;
atomic_inc(&user->fanotify_listeners);
+ group->memcg = get_mem_cgroup_from_mm(current->mm);
oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL);
if (unlikely(!oevent)) {
@@ -932,7 +934,8 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
*/
static int __init fanotify_user_setup(void)
{
- fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
+ fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
+ SLAB_PANIC|SLAB_ACCOUNT);
fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
fanotify_perm_event_cachep =
diff --git a/fs/notify/group.c b/fs/notify/group.c
index aa5468f23e45..c03b83662876 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -22,6 +22,7 @@
#include <linux/srcu.h>
#include <linux/rculist.h>
#include <linux/wait.h>
+#include <linux/memcontrol.h>
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
@@ -36,6 +37,8 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
if (group->ops->free_group_priv)
group->ops->free_group_priv(group);
+ mem_cgroup_put(group->memcg);
+
kfree(group);
}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 9ab6dde38a14..f4184b4f3815 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -31,6 +31,7 @@
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/user.h>
+#include <linux/sched/mm.h>
#include "inotify.h"
@@ -98,7 +99,11 @@ int inotify_handle_event(struct fsnotify_group *group,
i_mark = container_of(inode_mark, struct inotify_inode_mark,
fsn_mark);
- event = kmalloc(alloc_len, GFP_KERNEL);
+ /* Whoever is interested in the event, pays for the allocation. */
+ memalloc_use_memcg(group->memcg);
+ event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT);
+ memalloc_unuse_memcg();
+
if (unlikely(!event)) {
/*
* Treat lost event due to ENOMEM the same way as queue
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 6f48d325c350..ac6978d3208c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -38,6 +38,7 @@
#include <linux/uaccess.h>
#include <linux/poll.h>
#include <linux/wait.h>
+#include <linux/memcontrol.h>
#include "inotify.h"
#include "../fdinfo.h"
@@ -639,6 +640,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
oevent->name_len = 0;
group->max_events = max_events;
+ group->memcg = get_mem_cgroup_from_mm(current->mm);
spin_lock_init(&group->inotify_data.idr_lock);
idr_init(&group->inotify_data.idr);
@@ -815,7 +817,8 @@ static int __init inotify_user_setup(void)
BUG_ON(hweight32(ALL_INOTIFY_BITS) != 22);
- inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
+ inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark,
+ SLAB_PANIC|SLAB_ACCOUNT);
inotify_max_queued_events = 16384;
init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 3a2e509c77c5..8946130c87ad 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -93,13 +93,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
ofs = 0;
if (file_ofs < init_size)
ofs = init_size - file_ofs;
- local_irq_save(flags);
kaddr = kmap_atomic(page);
memset(kaddr + bh_offset(bh) + ofs, 0,
bh->b_size - ofs);
flush_dcache_page(page);
kunmap_atomic(kaddr);
- local_irq_restore(flags);
}
} else {
clear_buffer_uptodate(bh);
@@ -146,13 +144,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
recs = PAGE_SIZE / rec_size;
/* Should have been verified before we got here... */
BUG_ON(!recs);
- local_irq_save(flags);
kaddr = kmap_atomic(page);
for (i = 0; i < recs; i++)
post_read_mst_fixup((NTFS_RECORD*)(kaddr +
i * rec_size), rec_size);
kunmap_atomic(kaddr);
- local_irq_restore(flags);
flush_dcache_page(page);
if (likely(page_uptodate && !PageError(page)))
SetPageUptodate(page);
@@ -926,7 +922,7 @@ static int ntfs_write_mst_block(struct page *page,
ntfs_volume *vol = ni->vol;
u8 *kaddr;
unsigned int rec_size = ni->itype.index.block_size;
- ntfs_inode *locked_nis[PAGE_SIZE / rec_size];
+ ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE];
struct buffer_head *bh, *head, *tbh, *rec_start_bh;
struct buffer_head *bhs[MAX_BUF_PER_PAGE];
runlist_element *rl;
@@ -935,6 +931,9 @@ static int ntfs_write_mst_block(struct page *page,
bool sync, is_mft, page_is_dirty, rec_is_dirty;
unsigned char bh_size_bits;
+ if (WARN_ON(rec_size < NTFS_BLOCK_SIZE))
+ return -EINVAL;
+
ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
"0x%lx.", vi->i_ino, ni->type, page->index);
BUG_ON(!NInoNonResident(ni));
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index fbd0090d7d0c..df7c32b5fac7 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -128,6 +128,7 @@ static inline void handle_bounds_compressed_page(struct page *page,
/**
* ntfs_decompress - decompress a compression block into an array of pages
* @dest_pages: destination array of pages
+ * @completed_pages: scratch space to track completed pages
* @dest_index: current index into @dest_pages (IN/OUT)
* @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT)
* @dest_max_index: maximum index into @dest_pages (IN)
@@ -162,10 +163,10 @@ static inline void handle_bounds_compressed_page(struct page *page,
* Note to hackers: This function may not sleep until it has finished accessing
* the compression block @cb_start as it is a per-CPU buffer.
*/
-static int ntfs_decompress(struct page *dest_pages[], int *dest_index,
- int *dest_ofs, const int dest_max_index, const int dest_max_ofs,
- const int xpage, char *xpage_done, u8 *const cb_start,
- const u32 cb_size, const loff_t i_size,
+static int ntfs_decompress(struct page *dest_pages[], int completed_pages[],
+ int *dest_index, int *dest_ofs, const int dest_max_index,
+ const int dest_max_ofs, const int xpage, char *xpage_done,
+ u8 *const cb_start, const u32 cb_size, const loff_t i_size,
const s64 initialized_size)
{
/*
@@ -190,9 +191,6 @@ static int ntfs_decompress(struct page *dest_pages[], int *dest_index,
/* Variables for tag and token parsing. */
u8 tag; /* Current tag. */
int token; /* Loop counter for the eight tokens in tag. */
-
- /* Need this because we can't sleep, so need two stages. */
- int completed_pages[dest_max_index - *dest_index + 1];
int nr_completed_pages = 0;
/* Default error code. */
@@ -516,6 +514,7 @@ int ntfs_read_compressed_block(struct page *page)
unsigned int cb_clusters, cb_max_ofs;
int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
struct page **pages;
+ int *completed_pages;
unsigned char xpage_done = 0;
ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = "
@@ -528,14 +527,16 @@ int ntfs_read_compressed_block(struct page *page)
BUG_ON(ni->name_len);
pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
+ completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS);
/* Allocate memory to store the buffer heads we need. */
bhs_size = cb_size / block_size * sizeof(struct buffer_head *);
bhs = kmalloc(bhs_size, GFP_NOFS);
- if (unlikely(!pages || !bhs)) {
+ if (unlikely(!pages || !bhs || !completed_pages)) {
kfree(bhs);
kfree(pages);
+ kfree(completed_pages);
unlock_page(page);
ntfs_error(vol->sb, "Failed to allocate internal buffers.");
return -ENOMEM;
@@ -562,6 +563,7 @@ int ntfs_read_compressed_block(struct page *page)
if (xpage >= max_page) {
kfree(bhs);
kfree(pages);
+ kfree(completed_pages);
zero_user(page, 0, PAGE_SIZE);
ntfs_debug("Compressed read outside i_size - truncated?");
SetPageUptodate(page);
@@ -854,10 +856,10 @@ lock_retry_remap:
unsigned int prev_cur_page = cur_page;
ntfs_debug("Found compressed compression block.");
- err = ntfs_decompress(pages, &cur_page, &cur_ofs,
- cb_max_page, cb_max_ofs, xpage, &xpage_done,
- cb_pos, cb_size - (cb_pos - cb), i_size,
- initialized_size);
+ err = ntfs_decompress(pages, completed_pages, &cur_page,
+ &cur_ofs, cb_max_page, cb_max_ofs, xpage,
+ &xpage_done, cb_pos, cb_size - (cb_pos - cb),
+ i_size, initialized_size);
/*
* We can sleep from now on, lock already dropped by
* ntfs_decompress().
@@ -912,6 +914,7 @@ lock_retry_remap:
/* We no longer need the list of pages. */
kfree(pages);
+ kfree(completed_pages);
/* If we have completed the requested page, we return success. */
if (likely(xpage_done))
@@ -956,5 +959,6 @@ err_out:
}
}
kfree(pages);
+ kfree(completed_pages);
return -EIO;
}
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index decaf75d1cd5..bd3221cbdd95 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -667,18 +667,18 @@ static int ntfs_read_locked_inode(struct inode *vi)
* mtime is the last change of the data within the file. Not changed
* when only metadata is changed, e.g. a rename doesn't affect mtime.
*/
- vi->i_mtime = timespec_to_timespec64(ntfs2utc(si->last_data_change_time));
+ vi->i_mtime = ntfs2utc(si->last_data_change_time);
/*
* ctime is the last change of the metadata of the file. This obviously
* always changes, when mtime is changed. ctime can be changed on its
* own, mtime is then not changed, e.g. when a file is renamed.
*/
- vi->i_ctime = timespec_to_timespec64(ntfs2utc(si->last_mft_change_time));
+ vi->i_ctime = ntfs2utc(si->last_mft_change_time);
/*
* Last access to the data within the file. Not changed during a rename
* for example but changed whenever the file is written to.
*/
- vi->i_atime = timespec_to_timespec64(ntfs2utc(si->last_access_time));
+ vi->i_atime = ntfs2utc(si->last_access_time);
/* Find the attribute list attribute if present. */
ntfs_attr_reinit_search_ctx(ctx);
@@ -2997,7 +2997,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
le16_to_cpu(ctx->attr->data.resident.value_offset));
/* Update the access times if they have changed. */
- nt = utc2ntfs(timespec64_to_timespec(vi->i_mtime));
+ nt = utc2ntfs(vi->i_mtime);
if (si->last_data_change_time != nt) {
ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino, (long long)
@@ -3006,7 +3006,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si->last_data_change_time = nt;
modified = true;
}
- nt = utc2ntfs(timespec64_to_timespec(vi->i_ctime));
+ nt = utc2ntfs(vi->i_ctime);
if (si->last_mft_change_time != nt) {
ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino, (long long)
@@ -3015,7 +3015,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si->last_mft_change_time = nt;
modified = true;
}
- nt = utc2ntfs(timespec64_to_timespec(vi->i_atime));
+ nt = utc2ntfs(vi->i_atime);
if (si->last_access_time != nt) {
ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino,
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 32c523cf5a2d..fb14d17666c8 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -35,6 +35,8 @@
#include "mft.h"
#include "ntfs.h"
+#define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE)
+
/**
* map_mft_record_page - map the page in which a specific mft record resides
* @ni: ntfs inode whose mft record page to map
@@ -469,7 +471,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
struct page *page;
unsigned int blocksize = vol->sb->s_blocksize;
int max_bhs = vol->mft_record_size / blocksize;
- struct buffer_head *bhs[max_bhs];
+ struct buffer_head *bhs[MAX_BHS];
struct buffer_head *bh, *head;
u8 *kmirr;
runlist_element *rl;
@@ -479,6 +481,8 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
ntfs_debug("Entering for inode 0x%lx.", mft_no);
BUG_ON(!max_bhs);
+ if (WARN_ON(max_bhs > MAX_BHS))
+ return -EINVAL;
if (unlikely(!vol->mftmirr_ino)) {
/* This could happen during umount... */
err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
@@ -674,7 +678,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
unsigned int blocksize = vol->sb->s_blocksize;
unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
int max_bhs = vol->mft_record_size / blocksize;
- struct buffer_head *bhs[max_bhs];
+ struct buffer_head *bhs[MAX_BHS];
struct buffer_head *bh, *head;
runlist_element *rl;
unsigned int block_start, block_end, m_start, m_end;
@@ -684,6 +688,10 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
BUG_ON(NInoAttr(ni));
BUG_ON(!max_bhs);
BUG_ON(!PageLocked(page));
+ if (WARN_ON(max_bhs > MAX_BHS)) {
+ err = -EINVAL;
+ goto err_out;
+ }
/*
* If the ntfs_inode is clean no need to do anything. If it is dirty,
* mark it as clean now so that it can be redirtied later on if needed.
diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h
index 01233989d5d1..24cd719f1fd2 100644
--- a/fs/ntfs/time.h
+++ b/fs/ntfs/time.h
@@ -36,16 +36,16 @@
* Convert the Linux UTC time @ts to its corresponding NTFS time and return
* that in little endian format.
*
- * Linux stores time in a struct timespec consisting of a time_t (long at
- * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second
- * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of
- * 1-nano-second intervals since the value of tv_sec.
+ * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
+ * and a long tv_nsec where tv_sec is the number of 1-second intervals since
+ * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
+ * intervals since the value of tv_sec.
*
* NTFS uses Microsoft's standard time format which is stored in a s64 and is
* measured as the number of 100-nano-second intervals since 1st January 1601,
* 00:00:00 UTC.
*/
-static inline sle64 utc2ntfs(const struct timespec ts)
+static inline sle64 utc2ntfs(const struct timespec64 ts)
{
/*
* Convert the seconds to 100ns intervals, add the nano-seconds
@@ -63,7 +63,10 @@ static inline sle64 utc2ntfs(const struct timespec ts)
*/
static inline sle64 get_current_ntfs_time(void)
{
- return utc2ntfs(current_kernel_time());
+ struct timespec64 ts;
+
+ ktime_get_coarse_real_ts64(&ts);
+ return utc2ntfs(ts);
}
/**
@@ -73,18 +76,18 @@ static inline sle64 get_current_ntfs_time(void)
* Convert the little endian NTFS time @time to its corresponding Linux UTC
* time and return that in cpu format.
*
- * Linux stores time in a struct timespec consisting of a time_t (long at
- * present) tv_sec and a long tv_nsec where tv_sec is the number of 1-second
- * intervals since 1st January 1970, 00:00:00 UTC and tv_nsec is the number of
- * 1-nano-second intervals since the value of tv_sec.
+ * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
+ * and a long tv_nsec where tv_sec is the number of 1-second intervals since
+ * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
+ * intervals since the value of tv_sec.
*
* NTFS uses Microsoft's standard time format which is stored in a s64 and is
* measured as the number of 100 nano-second intervals since 1st January 1601,
* 00:00:00 UTC.
*/
-static inline struct timespec ntfs2utc(const sle64 time)
+static inline struct timespec64 ntfs2utc(const sle64 time)
{
- struct timespec ts;
+ struct timespec64 ts;
/* Subtract the NTFS time offset. */
u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0f157bbd3e0f..a342f008e42f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -932,13 +932,11 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
goto bail;
}
- if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+ if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation)
rc = ocfs2_error(sb,
"Extent block #%llu has an invalid h_fs_generation of #%u\n",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(eb->h_fs_generation));
- goto bail;
- }
bail:
return rc;
}
@@ -1481,19 +1479,17 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
while(le16_to_cpu(el->l_tree_depth) > 1) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
- ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty extent list (next_free_rec == 0)\n",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
- status = -EIO;
+ status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has empty extent list (next_free_rec == 0)\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
goto bail;
}
i = le16_to_cpu(el->l_next_free_rec) - 1;
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (!blkno) {
- ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has extent list where extent # %d has no physical block start\n",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
- status = -EIO;
+ status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has extent list where extent # %d has no physical block start\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
goto bail;
}
@@ -1598,10 +1594,8 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
* the new data. */
ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
meta_ac);
- if (ret < 0) {
+ if (ret < 0)
mlog_errno(ret);
- goto out;
- }
out:
if (final_depth)
@@ -3214,11 +3208,10 @@ rightmost_no_delete:
goto rightmost_no_delete;
if (le16_to_cpu(el->l_next_free_rec) == 0) {
- ret = -EIO;
- ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty extent block at %llu\n",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
- (unsigned long long)le64_to_cpu(eb->h_blkno));
+ ret = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has empty extent block at %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ (unsigned long long)le64_to_cpu(eb->h_blkno));
goto out;
}
@@ -4411,12 +4404,11 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
le16_to_cpu(new_el->l_count)) {
bh = path_leaf_bh(left_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- ocfs2_error(sb,
- "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno),
- le16_to_cpu(new_el->l_next_free_rec),
- le16_to_cpu(new_el->l_count));
- status = -EINVAL;
+ status = ocfs2_error(sb,
+ "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec),
+ le16_to_cpu(new_el->l_count));
goto free_left_path;
}
rec = &new_el->l_recs[
@@ -4466,11 +4458,10 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
bh = path_leaf_bh(right_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- ocfs2_error(sb,
- "Extent block #%llu has an invalid l_next_free_rec of %d\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno),
- le16_to_cpu(new_el->l_next_free_rec));
- status = -EINVAL;
+ status = ocfs2_error(sb,
+ "Extent block #%llu has an invalid l_next_free_rec of %d\n",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec));
goto free_right_path;
}
rec = &new_el->l_recs[1];
@@ -5523,10 +5514,8 @@ static int ocfs2_truncate_rec(handle_t *handle,
ocfs2_journal_dirty(handle, path_leaf_bh(path));
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
- if (ret) {
+ if (ret)
mlog_errno(ret);
- goto out;
- }
out:
ocfs2_free_path(left_path);
@@ -5659,10 +5648,8 @@ int ocfs2_remove_extent(handle_t *handle,
ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
cpos, len);
- if (ret) {
+ if (ret)
mlog_errno(ret);
- goto out;
- }
}
out:
@@ -5707,7 +5694,6 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
- goto out;
}
}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ea8c551bcd7e..9b2ed62dd638 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -127,13 +127,13 @@ enum o2hb_heartbeat_modes {
O2HB_HEARTBEAT_NUM_MODES,
};
-char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
- "local", /* O2HB_HEARTBEAT_LOCAL */
- "global", /* O2HB_HEARTBEAT_GLOBAL */
+static const char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
+ "local", /* O2HB_HEARTBEAT_LOCAL */
+ "global", /* O2HB_HEARTBEAT_GLOBAL */
};
unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
-unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
+static unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
/*
* o2hb_dependent_users tracks the number of registered callbacks that depend
@@ -141,7 +141,7 @@ unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
* However only o2dlm depends on the heartbeat. It does not want the heartbeat
* to stop while a dlm domain is still active.
*/
-unsigned int o2hb_dependent_users;
+static unsigned int o2hb_dependent_users;
/*
* In global heartbeat mode, all regions are pinned if there are one or more
@@ -2486,7 +2486,7 @@ unlock:
return ret;
}
-void o2hb_region_dec_user(const char *region_uuid)
+static void o2hb_region_dec_user(const char *region_uuid)
{
spin_lock(&o2hb_live_lock);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index da64c3a20eeb..0e4166cc23a0 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -35,9 +35,9 @@
* cluster references throughout where nodes are looked up */
struct o2nm_cluster *o2nm_single_cluster = NULL;
-char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
- "reset", /* O2NM_FENCE_RESET */
- "panic", /* O2NM_FENCE_PANIC */
+static const char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
+ "reset", /* O2NM_FENCE_RESET */
+ "panic", /* O2NM_FENCE_PANIC */
};
static inline void o2nm_lock_subsystem(void);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1296f78ae966..7d9eea7d4a87 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -872,8 +872,6 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
"for type %u key %08x\n", msg_type, key);
}
write_unlock(&o2net_handler_lock);
- if (ret)
- goto out;
out:
if (ret)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 0ff424c6d17c..8e712b614e6e 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -96,7 +96,7 @@ struct ocfs2_unblock_ctl {
};
/* Lockdep class keys */
-struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
+static struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
int new_level);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ddc3e9470c87..79279240fb6e 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -637,10 +637,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
handle = NULL;
status = ocfs2_commit_truncate(osb, inode, fe_bh);
- if (status < 0) {
+ if (status < 0)
mlog_errno(status);
- goto out;
- }
}
out:
@@ -1499,7 +1497,6 @@ static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
(unsigned long long)bh->b_blocknr,
le32_to_cpu(di->i_fs_generation));
rc = -OCFS2_FILECHECK_ERR_GENERATION;
- goto bail;
}
bail:
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index fe0d1f9571bb..7642b6712c39 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -663,11 +663,10 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
#ifdef CONFIG_OCFS2_DEBUG_FS
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
- ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
- (unsigned long long)le64_to_cpu(alloc->i_blkno),
- le32_to_cpu(alloc->id1.bitmap1.i_used),
- ocfs2_local_alloc_count_bits(alloc));
- status = -EIO;
+ status = ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
+ (unsigned long long)le64_to_cpu(alloc->i_blkno),
+ le32_to_cpu(alloc->id1.bitmap1.i_used),
+ ocfs2_local_alloc_count_bits(alloc));
goto bail;
}
#endif
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 16c42ed0dca8..b1a8b046f4c2 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -137,14 +137,13 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
int rc = 0;
struct buffer_head *tmp = *bh;
- if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
- ocfs2_error(inode->i_sb,
- "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)v_block,
- (unsigned long long)i_size_read(inode));
- return -EIO;
- }
+ if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block)
+ return ocfs2_error(inode->i_sb,
+ "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)v_block,
+ (unsigned long long)i_size_read(inode));
+
rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
ocfs2_validate_quota_block);
if (rc)
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 4cc090b50cc5..1dea7a8a5255 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -90,23 +90,22 @@ EXPORT_SYMBOL(seq_open);
static int traverse(struct seq_file *m, loff_t offset)
{
- loff_t pos = 0, index;
+ loff_t pos = 0;
int error = 0;
void *p;
m->version = 0;
- index = 0;
+ m->index = 0;
m->count = m->from = 0;
- if (!offset) {
- m->index = index;
+ if (!offset)
return 0;
- }
+
if (!m->buf) {
m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
if (!m->buf)
return -ENOMEM;
}
- p = m->op->start(m, &index);
+ p = m->op->start(m, &m->index);
while (p) {
error = PTR_ERR(p);
if (IS_ERR(p))
@@ -123,20 +122,15 @@ static int traverse(struct seq_file *m, loff_t offset)
if (pos + m->count > offset) {
m->from = offset - pos;
m->count -= m->from;
- m->index = index;
break;
}
pos += m->count;
m->count = 0;
- if (pos == offset) {
- index++;
- m->index = index;
+ p = m->op->next(m, p, &m->index);
+ if (pos == offset)
break;
- }
- p = m->op->next(m, p, &index);
}
m->op->stop(m, p);
- m->index = index;
return error;
Eoverflow:
@@ -160,7 +154,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
struct seq_file *m = file->private_data;
size_t copied = 0;
- loff_t pos;
size_t n;
void *p;
int err = 0;
@@ -223,16 +216,12 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
size -= n;
buf += n;
copied += n;
- if (!m->count) {
- m->from = 0;
- m->index++;
- }
if (!size)
goto Done;
}
/* we need at least one record in buffer */
- pos = m->index;
- p = m->op->start(m, &pos);
+ m->from = 0;
+ p = m->op->start(m, &m->index);
while (1) {
err = PTR_ERR(p);
if (!p || IS_ERR(p))
@@ -243,8 +232,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
if (unlikely(err))
m->count = 0;
if (unlikely(!m->count)) {
- p = m->op->next(m, p, &pos);
- m->index = pos;
+ p = m->op->next(m, p, &m->index);
continue;
}
if (m->count < m->size)
@@ -256,29 +244,33 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
if (!m->buf)
goto Enomem;
m->version = 0;
- pos = m->index;
- p = m->op->start(m, &pos);
+ p = m->op->start(m, &m->index);
}
m->op->stop(m, p);
m->count = 0;
goto Done;
Fill:
/* they want more? let's try to get some more */
- while (m->count < size) {
+ while (1) {
size_t offs = m->count;
- loff_t next = pos;
- p = m->op->next(m, p, &next);
+ loff_t pos = m->index;
+
+ p = m->op->next(m, p, &m->index);
+ if (pos == m->index)
+ /* Buggy ->next function */
+ m->index++;
if (!p || IS_ERR(p)) {
err = PTR_ERR(p);
break;
}
+ if (m->count >= size)
+ break;
err = m->op->show(m, p);
if (seq_has_overflowed(m) || err) {
m->count = offs;
if (likely(err <= 0))
break;
}
- pos = next;
}
m->op->stop(m, p);
n = min(m->count, size);
@@ -287,11 +279,7 @@ Fill:
goto Efault;
copied += n;
m->count -= n;
- if (m->count)
- m->from = n;
- else
- pos++;
- m->index = pos;
+ m->from = n;
Done:
if (!copied)
copied = err;
diff --git a/fs/super.c b/fs/super.c
index 50728d9c1a05..7429588d6b49 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -144,6 +144,9 @@ static unsigned long super_cache_count(struct shrinker *shrink,
total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
+ if (!total_objects)
+ return SHRINK_EMPTY;
+
total_objects = vfs_pressure_ratio(total_objects);
return total_objects;
}
@@ -244,10 +247,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
INIT_LIST_HEAD(&s->s_inodes_wb);
spin_lock_init(&s->s_inode_wblist_lock);
- if (list_lru_init_memcg(&s->s_dentry_lru))
- goto fail;
- if (list_lru_init_memcg(&s->s_inode_lru))
- goto fail;
s->s_count = 1;
atomic_set(&s->s_active, 1);
mutex_init(&s->s_vfs_rename_mutex);
@@ -265,6 +264,10 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
if (prealloc_shrinker(&s->s_shrink))
goto fail;
+ if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
+ goto fail;
+ if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
+ goto fail;
return s;
fail:
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index e727ee07dbe4..075d3d9114c8 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -547,7 +547,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
/*
* Block can be extended
*/
- ucg->cg_time = cpu_to_fs32(sb, get_seconds());
+ ucg->cg_time = ufs_get_seconds(sb);
for (i = newcount; i < (uspi->s_fpb - fragoff); i++)
if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
break;
@@ -639,7 +639,7 @@ cg_found:
if (!ufs_cg_chkmagic(sb, ucg))
ufs_panic (sb, "ufs_alloc_fragments",
"internal error, bad magic number on cg %u", cgno);
- ucg->cg_time = cpu_to_fs32(sb, get_seconds());
+ ucg->cg_time = ufs_get_seconds(sb);
if (count == uspi->s_fpb) {
result = ufs_alloccg_block (inode, ucpi, goal, err);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 02c0a4be4212..969fd60436d3 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -89,7 +89,7 @@ void ufs_free_inode (struct inode * inode)
if (!ufs_cg_chkmagic(sb, ucg))
ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number");
- ucg->cg_time = cpu_to_fs32(sb, get_seconds());
+ ucg->cg_time = ufs_get_seconds(sb);
is_directory = S_ISDIR(inode->i_mode);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 488088141451..a4e07e910f1b 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -698,7 +698,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
usb1 = ubh_get_usb_first(uspi);
usb3 = ubh_get_usb_third(uspi);
- usb1->fs_time = cpu_to_fs32(sb, get_seconds());
+ usb1->fs_time = ufs_get_seconds(sb);
if ((flags & UFS_ST_MASK) == UFS_ST_SUN ||
(flags & UFS_ST_MASK) == UFS_ST_SUNOS ||
(flags & UFS_ST_MASK) == UFS_ST_SUNx86)
@@ -1342,7 +1342,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
*/
if (*mount_flags & SB_RDONLY) {
ufs_put_super_internal(sb);
- usb1->fs_time = cpu_to_fs32(sb, get_seconds());
+ usb1->fs_time = ufs_get_seconds(sb);
if ((flags & UFS_ST_MASK) == UFS_ST_SUN
|| (flags & UFS_ST_MASK) == UFS_ST_SUNOS
|| (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 1907be6d5808..1fd3011ea623 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -590,3 +590,17 @@ static inline int ufs_is_data_ptr_zero(struct ufs_sb_private_info *uspi,
else
return *(__fs32 *)p == 0;
}
+
+static inline __fs32 ufs_get_seconds(struct super_block *sbp)
+{
+ time64_t now = ktime_get_real_seconds();
+
+ /* Signed 32-bit interpretation wraps around in 2038, which
+ * happens in ufs1 inode stamps but not ufs2 using 64-bits
+ * stamps. For superblock and blockgroup, let's assume
+ * unsigned 32-bit stamps, which are good until y2106.
+ * Wrap around rather than clamp here to make the dirty
+ * file system detection work in the superblock stamp.
+ */
+ return cpu_to_fs32(sbp, lower_32_bits(now));
+}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bad9cea37f12..15c265d450bf 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1849,17 +1849,14 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
struct userfaultfd_ctx *ctx = f->private_data;
wait_queue_entry_t *wq;
- struct userfaultfd_wait_queue *uwq;
unsigned long pending = 0, total = 0;
spin_lock(&ctx->fault_pending_wqh.lock);
list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
- uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
pending++;
total++;
}
list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
- uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
total++;
}
spin_unlock(&ctx->fault_pending_wqh.lock);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 181e9084519b..5eaef2c17293 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1169,7 +1169,7 @@ xfs_file_mmap(
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
- vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ vma->vm_flags |= VM_HUGEPAGE;
return 0;
}
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index a75cb371cd19..88ebc6102c7c 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1095,6 +1095,24 @@ static inline bool arch_has_pfn_modify_check(void)
}
#endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */
+/*
+ * Architecture PAGE_KERNEL_* fallbacks
+ *
+ * Some architectures don't define certain PAGE_KERNEL_* flags. This is either
+ * because they really don't support them, or the port needs to be updated to
+ * reflect the required functionality. Below are a set of relatively safe
+ * fallbacks, as best effort, which we can count on in lieu of the architectures
+ * not defining them on their own yet.
+ */
+
+#ifndef PAGE_KERNEL_RO
+# define PAGE_KERNEL_RO PAGE_KERNEL
+#endif
+
+#ifndef PAGE_KERNEL_EXEC
+# define PAGE_KERNEL_EXEC PAGE_KERNEL
+#endif
+
#endif /* !__ASSEMBLY__ */
#ifndef io_remap_pfn_range
diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index 65a6981eef7b..3f1ef4450a7c 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -53,7 +53,7 @@
({ \
BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \
_pfx "mask is not constant"); \
- BUILD_BUG_ON_MSG(!(_mask), _pfx "mask is zero"); \
+ BUILD_BUG_ON_MSG((_mask) == 0, _pfx "mask is zero"); \
BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \
~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \
_pfx "value too large for the field"); \
diff --git a/include/linux/cma.h b/include/linux/cma.h
index bf90f0bb42bd..190184b5ff32 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -33,7 +33,7 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
const char *name,
struct cma **res_cma);
extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
- gfp_t gfp_mask);
+ bool no_warn);
extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index 3c5a4cb3eb95..f247e8aa5e3d 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -112,7 +112,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size,
}
struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
- unsigned int order, gfp_t gfp_mask);
+ unsigned int order, bool no_warn);
bool dma_release_from_contiguous(struct device *dev, struct page *pages,
int count);
@@ -145,7 +145,7 @@ int dma_declare_contiguous(struct device *dev, phys_addr_t size,
static inline
struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
- unsigned int order, gfp_t gfp_mask)
+ unsigned int order, bool no_warn)
{
return NULL;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1ec33fd0423f..a9242f336f02 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -179,7 +179,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
#define ATTR_ATIME_SET (1 << 7)
#define ATTR_MTIME_SET (1 << 8)
#define ATTR_FORCE (1 << 9) /* Not a change, but a change it */
-#define ATTR_ATTR_FLAG (1 << 10)
#define ATTR_KILL_SUID (1 << 11)
#define ATTR_KILL_SGID (1 << 12)
#define ATTR_FILE (1 << 13)
@@ -345,6 +344,10 @@ struct address_space_operations {
/* Set a page dirty. Return true if this dirtied it */
int (*set_page_dirty)(struct page *page);
+ /*
+ * Reads in the requested pages. Unlike ->readpage(), this is
+ * PURELY used for read-ahead!.
+ */
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 2b9b6f1ff5e0..b8f4182f42f1 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -84,6 +84,8 @@ struct fsnotify_event_private_data;
struct fsnotify_fname;
struct fsnotify_iter_info;
+struct mem_cgroup;
+
/*
* Each group much define these ops. The fsnotify infrastructure will call
* these operations for each relevant group.
@@ -127,6 +129,8 @@ struct fsnotify_event {
* everything will be cleaned up.
*/
struct fsnotify_group {
+ const struct fsnotify_ops *ops; /* how this group handles things */
+
/*
* How the refcnt is used is up to each group. When the refcnt hits 0
* fsnotify will clean up all of the resources associated with this group.
@@ -137,8 +141,6 @@ struct fsnotify_group {
*/
refcount_t refcnt; /* things with interest in this group */
- const struct fsnotify_ops *ops; /* how this group handles things */
-
/* needed to send notification to userspace */
spinlock_t notification_lock; /* protect the notification_list */
struct list_head notification_list; /* list of event_holder this group needs to send to userspace */
@@ -160,6 +162,8 @@ struct fsnotify_group {
atomic_t num_marks; /* 1 for each mark and 1 for not being
* past the point of no return when freeing
* a group */
+ atomic_t user_waits; /* Number of tasks waiting for user
+ * response */
struct list_head marks_list; /* all inode marks for this group */
struct fasync_struct *fsn_fa; /* async notification */
@@ -167,8 +171,8 @@ struct fsnotify_group {
struct fsnotify_event *overflow_event; /* Event we queue when the
* notification list is too
* full */
- atomic_t user_waits; /* Number of tasks waiting for user
- * response */
+
+ struct mem_cgroup *memcg; /* memcg to charge allocations */
/* groups can define private fields here or use the void *private */
union {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 36fa6a2a82e3..c39d9170a8a0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -348,9 +348,6 @@ struct hstate {
struct huge_bootmem_page {
struct list_head list;
struct hstate *hstate;
-#ifdef CONFIG_HIGHMEM
- phys_addr_t phys;
-#endif
};
struct page *alloc_huge_page(struct vm_area_struct *vma,
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index de784fd11d12..46aae129917c 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -20,7 +20,7 @@ extern pmd_t kasan_zero_pmd[PTRS_PER_PMD];
extern pud_t kasan_zero_pud[PTRS_PER_PUD];
extern p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D];
-void kasan_populate_zero_shadow(const void *shadow_start,
+int kasan_populate_zero_shadow(const void *shadow_start,
const void *shadow_end);
static inline void *kasan_mem_to_shadow(const void *addr)
@@ -71,6 +71,9 @@ struct kasan_cache {
int kasan_module_alloc(void *addr, size_t size);
void kasan_free_shadow(const struct vm_struct *vm);
+int kasan_add_zero_shadow(void *start, unsigned long size);
+void kasan_remove_zero_shadow(void *start, unsigned long size);
+
size_t ksize(const void *);
static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); }
size_t kasan_metadata_size(struct kmem_cache *cache);
@@ -124,6 +127,14 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
static inline void kasan_free_shadow(const struct vm_struct *vm) {}
+static inline int kasan_add_zero_shadow(void *start, unsigned long size)
+{
+ return 0;
+}
+static inline void kasan_remove_zero_shadow(void *start,
+ unsigned long size)
+{}
+
static inline void kasan_unpoison_slab(const void *ptr) { }
static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 96def9d15b1b..aa5efd9351eb 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -42,7 +42,7 @@ struct list_lru_node {
spinlock_t lock;
/* global list, used for the root cgroup in cgroup aware lrus */
struct list_lru_one lru;
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
/* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
struct list_lru_memcg __rcu *memcg_lrus;
#endif
@@ -51,21 +51,25 @@ struct list_lru_node {
struct list_lru {
struct list_lru_node *node;
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
struct list_head list;
+ int shrinker_id;
#endif
};
void list_lru_destroy(struct list_lru *lru);
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
- struct lock_class_key *key);
+ struct lock_class_key *key, struct shrinker *shrinker);
-#define list_lru_init(lru) __list_lru_init((lru), false, NULL)
-#define list_lru_init_key(lru, key) __list_lru_init((lru), false, (key))
-#define list_lru_init_memcg(lru) __list_lru_init((lru), true, NULL)
+#define list_lru_init(lru) \
+ __list_lru_init((lru), false, NULL, NULL)
+#define list_lru_init_key(lru, key) \
+ __list_lru_init((lru), false, (key), NULL)
+#define list_lru_init_memcg(lru, shrinker) \
+ __list_lru_init((lru), true, NULL, shrinker)
int memcg_update_all_list_lrus(int num_memcgs);
-void memcg_drain_all_list_lrus(int src_idx, int dst_idx);
+void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg);
/**
* list_lru_add: add an element to the lru list's tail
@@ -162,6 +166,23 @@ unsigned long list_lru_walk_one(struct list_lru *lru,
int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk);
+/**
+ * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items.
+ * @lru: the lru pointer.
+ * @nid: the node id to scan from.
+ * @memcg: the cgroup to scan from.
+ * @isolate: callback function that is resposible for deciding what to do with
+ * the item currently being scanned
+ * @cb_arg: opaque type that will be passed to @isolate
+ * @nr_to_walk: how many items to scan.
+ *
+ * Same as @list_lru_walk_one except that the spinlock is acquired with
+ * spin_lock_irq().
+ */
+unsigned long list_lru_walk_one_irq(struct list_lru *lru,
+ int nid, struct mem_cgroup *memcg,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk);
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk);
@@ -175,6 +196,14 @@ list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
}
static inline unsigned long
+list_lru_shrink_walk_irq(struct list_lru *lru, struct shrink_control *sc,
+ list_lru_walk_cb isolate, void *cb_arg)
+{
+ return list_lru_walk_one_irq(lru, sc->nid, sc->memcg, isolate, cb_arg,
+ &sc->nr_to_scan);
+}
+
+static inline unsigned long
list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
void *cb_arg, unsigned long nr_to_walk)
{
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 680d3395fc83..0e6c515fb698 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -112,6 +112,15 @@ struct lruvec_stat {
};
/*
+ * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
+ * which have elements charged to this memcg.
+ */
+struct memcg_shrinker_map {
+ struct rcu_head rcu;
+ unsigned long map[0];
+};
+
+/*
* per-zone information in memory controller.
*/
struct mem_cgroup_per_node {
@@ -124,6 +133,9 @@ struct mem_cgroup_per_node {
struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
+#ifdef CONFIG_MEMCG_KMEM
+ struct memcg_shrinker_map __rcu *shrinker_map;
+#endif
struct rb_node tree_node; /* RB tree node */
unsigned long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
@@ -271,7 +283,7 @@ struct mem_cgroup {
bool tcpmem_active;
int tcpmem_pressure;
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
/* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
enum memcg_kmem_state kmem_state;
@@ -306,6 +318,11 @@ struct mem_cgroup {
extern struct mem_cgroup *root_mem_cgroup;
+static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
+{
+ return (memcg == root_mem_cgroup);
+}
+
static inline bool mem_cgroup_disabled(void)
{
return !cgroup_subsys_enabled(memory_cgrp_subsys);
@@ -373,11 +390,21 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
+
+struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
+
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
}
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+ if (memcg)
+ css_put(&memcg->css);
+}
+
#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -497,16 +524,16 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);
-static inline void mem_cgroup_oom_enable(void)
+static inline void mem_cgroup_enter_user_fault(void)
{
- WARN_ON(current->memcg_may_oom);
- current->memcg_may_oom = 1;
+ WARN_ON(current->in_user_fault);
+ current->in_user_fault = 1;
}
-static inline void mem_cgroup_oom_disable(void)
+static inline void mem_cgroup_exit_user_fault(void)
{
- WARN_ON(!current->memcg_may_oom);
- current->memcg_may_oom = 0;
+ WARN_ON(!current->in_user_fault);
+ current->in_user_fault = 0;
}
static inline bool task_in_memcg_oom(struct task_struct *p)
@@ -762,6 +789,11 @@ void mem_cgroup_split_huge_fixup(struct page *head);
struct mem_cgroup;
+static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
+{
+ return true;
+}
+
static inline bool mem_cgroup_disabled(void)
{
return true;
@@ -850,6 +882,20 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
return true;
}
+static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+{
+ return NULL;
+}
+
+static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
+{
+ return NULL;
+}
+
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+}
+
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -937,11 +983,11 @@ static inline void mem_cgroup_handle_over_high(void)
{
}
-static inline void mem_cgroup_oom_enable(void)
+static inline void mem_cgroup_enter_user_fault(void)
{
}
-static inline void mem_cgroup_oom_disable(void)
+static inline void mem_cgroup_exit_user_fault(void)
{
}
@@ -1207,7 +1253,7 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
int memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
void memcg_kmem_uncharge(struct page *page, int order);
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
extern struct static_key_false memcg_kmem_enabled_key;
extern struct workqueue_struct *memcg_kmem_cache_wq;
@@ -1238,6 +1284,10 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}
+extern int memcg_expand_shrinker_maps(int new_id);
+
+extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
+ int nid, int shrinker_id);
#else
#define for_each_memcg_cache_index(_idx) \
for (; NULL; )
@@ -1260,6 +1310,8 @@ static inline void memcg_put_cache_ids(void)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
+ int nid, int shrinker_id) { }
+#endif /* CONFIG_MEMCG_KMEM */
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 68a5121694ef..a3cae495f9ce 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2665,12 +2665,7 @@ extern int randomize_va_space;
const char * arch_vma_name(struct vm_area_struct *vma);
void print_vma_addr(char *prefix, unsigned long rip);
-void sparse_mem_maps_populate_node(struct page **map_map,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count,
- int nodeid);
-
+void *sparse_buffer_alloc(unsigned long size);
struct page *sparse_mem_map_populate(unsigned long pnum, int nid,
struct vmem_altmap *altmap);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
@@ -2752,7 +2747,8 @@ extern void clear_huge_page(struct page *page,
unsigned long addr_hint,
unsigned int pages_per_huge_page);
extern void copy_user_huge_page(struct page *dst, struct page *src,
- unsigned long addr, struct vm_area_struct *vma,
+ unsigned long addr_hint,
+ struct vm_area_struct *vma,
unsigned int pages_per_huge_page);
extern long copy_huge_page_from_user(struct page *dst_page,
const void __user *usr_src,
diff --git a/include/linux/node.h b/include/linux/node.h
index 6d336e38d155..257bb3d6d014 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -33,10 +33,10 @@ typedef void (*node_registration_func_t)(struct node *);
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA)
extern int link_mem_sections(int nid, unsigned long start_pfn,
- unsigned long nr_pages, bool check_nid);
+ unsigned long end_pfn);
#else
static inline int link_mem_sections(int nid, unsigned long start_pfn,
- unsigned long nr_pages, bool check_nid)
+ unsigned long end_pfn)
{
return 0;
}
@@ -54,12 +54,14 @@ static inline int register_one_node(int nid)
if (node_online(nid)) {
struct pglist_data *pgdat = NODE_DATA(nid);
+ unsigned long start_pfn = pgdat->node_start_pfn;
+ unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
error = __register_one_node(nid);
if (error)
return error;
/* link memory sections under this node */
- error = link_mem_sections(nid, pgdat->node_start_pfn, pgdat->node_spanned_pages, true);
+ error = link_mem_sections(nid, start_pfn, end_pfn);
}
return error;
@@ -69,7 +71,7 @@ extern void unregister_one_node(int nid);
extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
extern int register_mem_sect_under_node(struct memory_block *mem_blk,
- int nid, bool check_nid);
+ void *arg);
extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
unsigned long phys_index);
@@ -99,7 +101,7 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
return 0;
}
static inline int register_mem_sect_under_node(struct memory_block *mem_blk,
- int nid, bool check_nid)
+ void *arg)
{
return 0;
}
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index ca5461efae2f..f84f167ec04c 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -16,18 +16,7 @@ struct page_ext_operations {
#ifdef CONFIG_PAGE_EXTENSION
-/*
- * page_ext->flags bits:
- *
- * PAGE_EXT_DEBUG_POISON is set for poisoned pages. This is used to
- * implement generic debug pagealloc feature. The pages are filled with
- * poison patterns and set this flag after free_pages(). The poisoned
- * pages are verified whether the patterns are not corrupted and clear
- * the flag before alloc_pages().
- */
-
enum page_ext_flags {
- PAGE_EXT_DEBUG_POISON, /* Page is poisoned */
PAGE_EXT_DEBUG_GUARD,
PAGE_EXT_OWNER,
#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
@@ -61,7 +50,7 @@ static inline void page_ext_init(void)
}
#endif
-struct page_ext *lookup_page_ext(struct page *page);
+struct page_ext *lookup_page_ext(const struct page *page);
#else /* !CONFIG_PAGE_EXTENSION */
struct page_ext;
@@ -70,7 +59,7 @@ static inline void pgdat_page_ext_init(struct pglist_data *pgdat)
{
}
-static inline struct page_ext *lookup_page_ext(struct page *page)
+static inline struct page_ext *lookup_page_ext(const struct page *page)
{
return NULL;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 95a5018c338e..789923fbee3a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -722,8 +722,8 @@ struct task_struct {
unsigned restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
- unsigned memcg_may_oom:1;
-#ifndef CONFIG_SLOB
+ unsigned in_user_fault:1;
+#ifdef CONFIG_MEMCG_KMEM
unsigned memcg_kmem_skip_account:1;
#endif
#endif
@@ -1152,6 +1152,9 @@ struct task_struct {
/* Number of pages to reclaim on returning to userland: */
unsigned int memcg_nr_pages_over_high;
+
+ /* Used by memcontrol for targeted memcg charge: */
+ struct mem_cgroup *active_memcg;
#endif
#ifdef CONFIG_BLK_CGROUP
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 44d356f5e47c..aebb370a0006 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -248,6 +248,43 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
current->flags = (current->flags & ~PF_MEMALLOC) | flags;
}
+#ifdef CONFIG_MEMCG
+/**
+ * memalloc_use_memcg - Starts the remote memcg charging scope.
+ * @memcg: memcg to charge.
+ *
+ * This function marks the beginning of the remote memcg charging scope. All the
+ * __GFP_ACCOUNT allocations till the end of the scope will be charged to the
+ * given memcg.
+ *
+ * NOTE: This function is not nesting safe.
+ */
+static inline void memalloc_use_memcg(struct mem_cgroup *memcg)
+{
+ WARN_ON_ONCE(current->active_memcg);
+ current->active_memcg = memcg;
+}
+
+/**
+ * memalloc_unuse_memcg - Ends the remote memcg charging scope.
+ *
+ * This function marks the end of the remote memcg charging scope started by
+ * memalloc_use_memcg().
+ */
+static inline void memalloc_unuse_memcg(void)
+{
+ current->active_memcg = NULL;
+}
+#else
+static inline void memalloc_use_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void memalloc_unuse_memcg(void)
+{
+}
+#endif
+
#ifdef CONFIG_MEMBARRIER
enum {
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 6794490f25b2..b154fd2b084c 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -34,12 +34,15 @@ struct shrink_control {
};
#define SHRINK_STOP (~0UL)
+#define SHRINK_EMPTY (~0UL - 1)
/*
* A callback you can register to apply pressure to ageable caches.
*
* @count_objects should return the number of freeable items in the cache. If
- * there are no objects to free or the number of freeable items cannot be
- * determined, it should return 0. No deadlock checks should be done during the
+ * there are no objects to free, it should return SHRINK_EMPTY, while 0 is
+ * returned in cases of the number of freeable items cannot be determined
+ * or shrinker should skip this cache for this time (e.g., their number
+ * is below shrinkable limit). No deadlock checks should be done during the
* count callback - the shrinker relies on aggregating scan counts that couldn't
* be executed due to potential deadlocks to be run at a later call when the
* deadlock condition is no longer pending.
@@ -66,6 +69,10 @@ struct shrinker {
/* These are for internal use */
struct list_head list;
+#ifdef CONFIG_MEMCG_KMEM
+ /* ID in shrinker_idr */
+ int id;
+#endif
/* objs pending delete, per node */
atomic_long_t *nr_deferred;
};
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 14e3fe4bd6a1..ed9cbddeb4a6 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -97,7 +97,7 @@
# define SLAB_FAILSLAB 0
#endif
/* Account to memcg */
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000U)
#else
# define SLAB_ACCOUNT 0
diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
index a5b3aa8d281f..3e9a963edd6a 100644
--- a/include/linux/vmacache.h
+++ b/include/linux/vmacache.h
@@ -5,12 +5,6 @@
#include <linux/sched.h>
#include <linux/mm.h>
-/*
- * Hash based on the page number. Provides a good hit rate for
- * workloads with good locality and those with random accesses as well.
- */
-#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK)
-
static inline void vmacache_flush(struct task_struct *tsk)
{
memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas));
diff --git a/init/Kconfig b/init/Kconfig
index 4dc783023e43..9bd50ba8253f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -708,6 +708,11 @@ config MEMCG_SWAP_ENABLED
select this option (if, for some reason, they need to disable it
then swapaccount=0 does the trick).
+config MEMCG_KMEM
+ bool
+ depends on MEMCG && !SLOB
+ default y
+
config BLK_CGROUP
bool "IO controller"
depends on BLOCK
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index d987dcd1bd56..286d82329eb0 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -178,7 +178,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
* @dev: Pointer to device for which the allocation is performed.
* @count: Requested number of pages.
* @align: Requested alignment of pages (in PAGE_SIZE order).
- * @gfp_mask: GFP flags to use for this allocation.
+ * @no_warn: Avoid printing message about failed allocation.
*
* This function allocates memory buffer for specified device. It uses
* device specific contiguous memory area if available or the default
@@ -186,12 +186,12 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
* function.
*/
struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
- unsigned int align, gfp_t gfp_mask)
+ unsigned int align, bool no_warn)
{
if (align > CONFIG_CMA_ALIGNMENT)
align = CONFIG_CMA_ALIGNMENT;
- return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask);
+ return cma_alloc(dev_get_cma_area(dev), count, align, no_warn);
}
/**
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index c2860c5a9e96..1c35b7b945d0 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -78,7 +78,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
again:
/* CMA can be used only in the context which permits sleeping */
if (gfpflags_allow_blocking(gfp)) {
- page = dma_alloc_from_contiguous(dev, count, page_order, gfp);
+ page = dma_alloc_from_contiguous(dev, count, page_order,
+ gfp & __GFP_NOWARN);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_release_from_contiguous(dev, page, count);
page = NULL;
diff --git a/kernel/fork.c b/kernel/fork.c
index 33112315b5c0..5ee74c113381 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -871,6 +871,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->use_memdelay = 0;
#endif
+#ifdef CONFIG_MEMCG
+ tsk->active_memcg = NULL;
+#endif
return tsk;
free_stack:
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 38283363da06..1f87ea6b6545 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/pfn_t.h>
#include <linux/io.h>
+#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
#include <linux/swap.h>
@@ -137,6 +138,7 @@ static void devm_memremap_pages_release(void *data)
mem_hotplug_begin();
arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
&pgmap->altmap : NULL);
+ kasan_remove_zero_shadow(__va(align_start), align_size);
mem_hotplug_done();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
@@ -239,6 +241,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
goto err_pfn_remap;
mem_hotplug_begin();
+ error = kasan_add_zero_shadow(__va(align_start), align_size);
+ if (error) {
+ mem_hotplug_done();
+ goto err_kasan;
+ }
+
error = arch_add_memory(nid, align_start, align_size, altmap, false);
if (!error)
move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
@@ -267,6 +275,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
return __va(res->start);
err_add_memory:
+ kasan_remove_zero_shadow(__va(align_start), align_size);
+ err_kasan:
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
err_pfn_remap:
err_radix:
diff --git a/mm/Kconfig b/mm/Kconfig
index 9ae1b6a8e30f..a550635ea5c3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -118,10 +118,6 @@ config SPARSEMEM_EXTREME
config SPARSEMEM_VMEMMAP_ENABLE
bool
-config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- def_bool y
- depends on SPARSEMEM && X86_64
-
config SPARSEMEM_VMEMMAP
bool "Sparse Memory virtual memmap"
depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
@@ -423,10 +419,11 @@ config ARCH_WANTS_THP_SWAP
config THP_SWAP
def_bool y
- depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP
+ depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
help
Swap transparent huge pages in one piece, without splitting.
- XXX: For now this only does clustered swap space allocation.
+ XXX: For now, swap cluster backing transparent huge page
+ will be split after swapout.
For selection by architectures with reasonable THP sizes.
@@ -638,7 +635,7 @@ config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kthreads"
default n
depends on NO_BOOTMEM
- depends on !FLATMEM
+ depends on SPARSEMEM
depends on !NEED_PER_CPU_KM
help
Ordinarily all struct pages are initialised during early boot in a
diff --git a/mm/cma.c b/mm/cma.c
index 5809bbe360d7..4cb76121a3ab 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -395,13 +395,13 @@ static inline void cma_debug_show_areas(struct cma *cma) { }
* @cma: Contiguous memory region for which the allocation is performed.
* @count: Requested number of pages.
* @align: Requested alignment of pages (in PAGE_SIZE order).
- * @gfp_mask: GFP mask to use during compaction
+ * @no_warn: Avoid printing message about failed allocation
*
* This function allocates part of contiguous memory on specific
* contiguous memory area.
*/
struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
- gfp_t gfp_mask)
+ bool no_warn)
{
unsigned long mask, offset;
unsigned long pfn = -1;
@@ -447,7 +447,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
mutex_lock(&cma_mutex);
ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
- gfp_mask);
+ GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
mutex_unlock(&cma_mutex);
if (ret == 0) {
page = pfn_to_page(pfn);
@@ -466,7 +466,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
trace_cma_alloc(pfn, page, count, align);
- if (ret && !(gfp_mask & __GFP_NOWARN)) {
+ if (ret && !no_warn) {
pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
__func__, count, ret);
cma_debug_show_areas(cma);
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index f23467291cfb..ad6723e9d110 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -139,7 +139,7 @@ static int cma_alloc_mem(struct cma *cma, int count)
if (!mem)
return -ENOMEM;
- p = cma_alloc(cma, count, 0, GFP_KERNEL);
+ p = cma_alloc(cma, count, 0, false);
if (!p) {
kfree(mem);
return -ENOMEM;
diff --git a/mm/fadvise.c b/mm/fadvise.c
index afa41491d324..2d8376e3c640 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -72,8 +72,12 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
goto out;
}
- /* Careful about overflows. Len == 0 means "as much as possible" */
- endbyte = offset + len;
+ /*
+ * Careful about overflows. Len == 0 means "as much as possible". Use
+ * unsigned math because signed overflows are undefined and UBSan
+ * complains.
+ */
+ endbyte = (u64)offset + (u64)len;
if (!len || endbyte < len)
endbyte = -1;
else
diff --git a/mm/hmm.c b/mm/hmm.c
index de7b6bf77201..76e7a058b32f 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -299,14 +299,14 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
- int r;
+ vm_fault_t ret;
flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
flags |= write_fault ? FAULT_FLAG_WRITE : 0;
- r = handle_mm_fault(vma, addr, flags);
- if (r & VM_FAULT_RETRY)
+ ret = handle_mm_fault(vma, addr, flags);
+ if (ret & VM_FAULT_RETRY)
return -EBUSY;
- if (r & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_ERROR) {
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
}
@@ -676,7 +676,8 @@ int hmm_vma_get_pfns(struct hmm_range *range)
return -EINVAL;
/* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
+ vma_is_dax(vma)) {
hmm_pfns_special(range);
return -EINVAL;
}
@@ -849,7 +850,8 @@ int hmm_vma_fault(struct hmm_range *range, bool block)
return -EINVAL;
/* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
+ vma_is_dax(vma)) {
hmm_pfns_special(range);
return -EINVAL;
}
@@ -971,10 +973,7 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
static void hmm_devmem_radix_release(struct resource *resource)
{
- resource_size_t key, align_start, align_size;
-
- align_start = resource->start & ~(PA_SECTION_SIZE - 1);
- align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
+ resource_size_t key;
mutex_lock(&hmm_devmem_lock);
for (key = resource->start;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a9e1e093df51..78427af91de9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -762,11 +762,11 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
* but we need to be consistent with PTEs and architectures that
* can't support a 'special' bit.
*/
- BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+ !pfn_t_devmap(pfn));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- BUG_ON(!pfn_t_devmap(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
@@ -1328,7 +1328,8 @@ alloc:
if (!page)
clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
else
- copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+ copy_user_huge_page(new_page, page, vmf->address,
+ vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
mmun_start = haddr;
@@ -1740,7 +1741,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
} else {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
- add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
+ add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
}
spin_unlock(ptl);
@@ -2090,7 +2091,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
SetPageReferenced(page);
page_remove_rmap(page, true);
put_page(page);
- add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
+ add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
} else if (is_huge_zero_pmd(*pmd)) {
/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3103099f64fd..47566bb0b4b1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2101,7 +2101,7 @@ int __alloc_bootmem_huge_page(struct hstate *h)
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
- addr = memblock_virt_alloc_try_nid_nopanic(
+ addr = memblock_virt_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
0, BOOTMEM_ALLOC_ACCESSIBLE, node);
if (addr) {
@@ -2119,6 +2119,7 @@ int __alloc_bootmem_huge_page(struct hstate *h)
found:
BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
/* Put them into a private list first because mem_map is not up yet */
+ INIT_LIST_HEAD(&m->list);
list_add(&m->list, &huge_boot_pages);
m->hstate = h;
return 1;
@@ -2139,16 +2140,9 @@ static void __init gather_bootmem_prealloc(void)
struct huge_bootmem_page *m;
list_for_each_entry(m, &huge_boot_pages, list) {
+ struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
- struct page *page;
-#ifdef CONFIG_HIGHMEM
- page = pfn_to_page(m->phys >> PAGE_SHIFT);
- memblock_free_late(__pa(m),
- sizeof(struct huge_bootmem_page));
-#else
- page = virt_to_page(m);
-#endif
WARN_ON(page_count(page) != 1);
prep_compound_huge_page(page, h->order);
WARN_ON(PageReserved(page));
@@ -3518,6 +3512,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
int ret = 0, outside_reserve = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ unsigned long haddr = address & huge_page_mask(h);
pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
@@ -3527,7 +3522,7 @@ retry_avoidcopy:
* and just make the page writable */
if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
page_move_anon_rmap(old_page, vma);
- set_huge_ptep_writable(vma, address, ptep);
+ set_huge_ptep_writable(vma, haddr, ptep);
return 0;
}
@@ -3551,7 +3546,7 @@ retry_avoidcopy:
* be acquired again before returning to the caller, as expected.
*/
spin_unlock(ptl);
- new_page = alloc_huge_page(vma, address, outside_reserve);
+ new_page = alloc_huge_page(vma, haddr, outside_reserve);
if (IS_ERR(new_page)) {
/*
@@ -3564,11 +3559,10 @@ retry_avoidcopy:
if (outside_reserve) {
put_page(old_page);
BUG_ON(huge_pte_none(pte));
- unmap_ref_private(mm, vma, old_page, address);
+ unmap_ref_private(mm, vma, old_page, haddr);
BUG_ON(huge_pte_none(pte));
spin_lock(ptl);
- ptep = huge_pte_offset(mm, address & huge_page_mask(h),
- huge_page_size(h));
+ ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep &&
pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
@@ -3598,7 +3592,7 @@ retry_avoidcopy:
__SetPageUptodate(new_page);
set_page_huge_active(new_page);
- mmun_start = address & huge_page_mask(h);
+ mmun_start = haddr;
mmun_end = mmun_start + huge_page_size(h);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
@@ -3607,25 +3601,24 @@ retry_avoidcopy:
* before the page tables are altered
*/
spin_lock(ptl);
- ptep = huge_pte_offset(mm, address & huge_page_mask(h),
- huge_page_size(h));
+ ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
ClearPagePrivate(new_page);
/* Break COW */
- huge_ptep_clear_flush(vma, address, ptep);
+ huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
- set_huge_pte_at(mm, address, ptep,
+ set_huge_pte_at(mm, haddr, ptep,
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
- hugepage_add_new_anon_rmap(new_page, vma, address);
+ hugepage_add_new_anon_rmap(new_page, vma, haddr);
/* Make the old page be freed below */
new_page = old_page;
}
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out_release_all:
- restore_reserve_on_error(h, vma, address, new_page);
+ restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
out_release_old:
put_page(old_page);
@@ -3828,7 +3821,7 @@ retry:
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl);
+ ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
}
spin_unlock(ptl);
@@ -3982,7 +3975,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
- ret = hugetlb_cow(mm, vma, haddr, ptep,
+ ret = hugetlb_cow(mm, vma, address, ptep,
pagecache_page, ptl);
goto out_put_page;
}
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
index f436246ccc79..7a2a2f13f86f 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/kasan_init.c
@@ -17,10 +17,13 @@
#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/pfn.h>
+#include <linux/slab.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
+#include "kasan.h"
+
/*
* This page serves two purposes:
* - It used as early shadow memory. The entire shadow region populated
@@ -32,22 +35,59 @@ unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
#if CONFIG_PGTABLE_LEVELS > 4
p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss;
+static inline bool kasan_p4d_table(pgd_t pgd)
+{
+ return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d));
+}
+#else
+static inline bool kasan_p4d_table(pgd_t pgd)
+{
+ return 0;
+}
#endif
#if CONFIG_PGTABLE_LEVELS > 3
pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
+static inline bool kasan_pud_table(p4d_t p4d)
+{
+ return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud));
+}
+#else
+static inline bool kasan_pud_table(p4d_t p4d)
+{
+ return 0;
+}
#endif
#if CONFIG_PGTABLE_LEVELS > 2
pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
+static inline bool kasan_pmd_table(pud_t pud)
+{
+ return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd));
+}
+#else
+static inline bool kasan_pmd_table(pud_t pud)
+{
+ return 0;
+}
#endif
pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
+static inline bool kasan_pte_table(pmd_t pmd)
+{
+ return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte));
+}
+
+static inline bool kasan_zero_page_entry(pte_t pte)
+{
+ return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page));
+}
+
static __init void *early_alloc(size_t size, int node)
{
return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS),
BOOTMEM_ALLOC_ACCESSIBLE, node);
}
-static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
+static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr,
unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
@@ -63,7 +103,7 @@ static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
}
}
-static void __init zero_pmd_populate(pud_t *pud, unsigned long addr,
+static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
unsigned long end)
{
pmd_t *pmd = pmd_offset(pud, addr);
@@ -78,14 +118,24 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr,
}
if (pmd_none(*pmd)) {
- pmd_populate_kernel(&init_mm, pmd,
- early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ pte_t *p;
+
+ if (slab_is_available())
+ p = pte_alloc_one_kernel(&init_mm, addr);
+ else
+ p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
+ if (!p)
+ return -ENOMEM;
+
+ pmd_populate_kernel(&init_mm, pmd, p);
}
zero_pte_populate(pmd, addr, next);
} while (pmd++, addr = next, addr != end);
+
+ return 0;
}
-static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr,
+static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
unsigned long end)
{
pud_t *pud = pud_offset(p4d, addr);
@@ -103,14 +153,24 @@ static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr,
}
if (pud_none(*pud)) {
- pud_populate(&init_mm, pud,
- early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ pmd_t *p;
+
+ if (slab_is_available()) {
+ p = pmd_alloc(&init_mm, pud, addr);
+ if (!p)
+ return -ENOMEM;
+ } else {
+ pud_populate(&init_mm, pud,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
}
zero_pmd_populate(pud, addr, next);
} while (pud++, addr = next, addr != end);
+
+ return 0;
}
-static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
+static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
unsigned long end)
{
p4d_t *p4d = p4d_offset(pgd, addr);
@@ -132,11 +192,21 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
}
if (p4d_none(*p4d)) {
- p4d_populate(&init_mm, p4d,
- early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ pud_t *p;
+
+ if (slab_is_available()) {
+ p = pud_alloc(&init_mm, p4d, addr);
+ if (!p)
+ return -ENOMEM;
+ } else {
+ p4d_populate(&init_mm, p4d,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
}
zero_pud_populate(p4d, addr, next);
} while (p4d++, addr = next, addr != end);
+
+ return 0;
}
/**
@@ -145,7 +215,7 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
* @shadow_start - start of the memory range to populate
* @shadow_end - end of the memory range to populate
*/
-void __init kasan_populate_zero_shadow(const void *shadow_start,
+int __ref kasan_populate_zero_shadow(const void *shadow_start,
const void *shadow_end)
{
unsigned long addr = (unsigned long)shadow_start;
@@ -191,9 +261,229 @@ void __init kasan_populate_zero_shadow(const void *shadow_start,
}
if (pgd_none(*pgd)) {
- pgd_populate(&init_mm, pgd,
- early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ p4d_t *p;
+
+ if (slab_is_available()) {
+ p = p4d_alloc(&init_mm, pgd, addr);
+ if (!p)
+ return -ENOMEM;
+ } else {
+ pgd_populate(&init_mm, pgd,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
}
zero_p4d_populate(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
+
+ return 0;
+}
+
+static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (!pte_none(*pte))
+ return;
+ }
+
+ pte_free_kernel(&init_mm, (pte_t *)page_to_virt(pmd_page(*pmd)));
+ pmd_clear(pmd);
+}
+
+static void kasan_free_pmd(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (!pmd_none(*pmd))
+ return;
+ }
+
+ pmd_free(&init_mm, (pmd_t *)page_to_virt(pud_page(*pud)));
+ pud_clear(pud);
+}
+
+static void kasan_free_pud(pud_t *pud_start, p4d_t *p4d)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (!pud_none(*pud))
+ return;
+ }
+
+ pud_free(&init_mm, (pud_t *)page_to_virt(p4d_page(*p4d)));
+ p4d_clear(p4d);
+}
+
+static void kasan_free_p4d(p4d_t *p4d_start, pgd_t *pgd)
+{
+ p4d_t *p4d;
+ int i;
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ p4d = p4d_start + i;
+ if (!p4d_none(*p4d))
+ return;
+ }
+
+ p4d_free(&init_mm, (p4d_t *)page_to_virt(pgd_page(*pgd)));
+ pgd_clear(pgd);
+}
+
+static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
+
+ if (WARN_ON(!kasan_zero_page_entry(*pte)))
+ continue;
+ pte_clear(&init_mm, addr, pte);
+ }
+}
+
+static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, pmd++) {
+ pte_t *pte;
+
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (kasan_pte_table(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE))
+ pmd_clear(pmd);
+ continue;
+ }
+ pte = pte_offset_kernel(pmd, addr);
+ kasan_remove_pte_table(pte, addr, next);
+ kasan_free_pte(pte_offset_kernel(pmd, 0), pmd);
+ }
+}
+
+static void kasan_remove_pud_table(pud_t *pud, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, pud++) {
+ pmd_t *pmd, *pmd_base;
+
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (kasan_pmd_table(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE))
+ pud_clear(pud);
+ continue;
+ }
+ pmd = pmd_offset(pud, addr);
+ pmd_base = pmd_offset(pud, 0);
+ kasan_remove_pmd_table(pmd, addr, next);
+ kasan_free_pmd(pmd_base, pud);
+ }
+}
+
+static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr,
+ unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next, p4d++) {
+ pud_t *pud;
+
+ next = p4d_addr_end(addr, end);
+
+ if (!p4d_present(*p4d))
+ continue;
+
+ if (kasan_pud_table(*p4d)) {
+ if (IS_ALIGNED(addr, P4D_SIZE) &&
+ IS_ALIGNED(next, P4D_SIZE))
+ p4d_clear(p4d);
+ continue;
+ }
+ pud = pud_offset(p4d, addr);
+ kasan_remove_pud_table(pud, addr, next);
+ kasan_free_pud(pud_offset(p4d, 0), p4d);
+ }
+}
+
+void kasan_remove_zero_shadow(void *start, unsigned long size)
+{
+ unsigned long addr, end, next;
+ pgd_t *pgd;
+
+ addr = (unsigned long)kasan_mem_to_shadow(start);
+ end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT);
+
+ if (WARN_ON((unsigned long)start %
+ (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
+ WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ return;
+
+ for (; addr < end; addr = next) {
+ p4d_t *p4d;
+
+ next = pgd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ if (!pgd_present(*pgd))
+ continue;
+
+ if (kasan_p4d_table(*pgd)) {
+ if (IS_ALIGNED(addr, PGDIR_SIZE) &&
+ IS_ALIGNED(next, PGDIR_SIZE))
+ pgd_clear(pgd);
+ continue;
+ }
+
+ p4d = p4d_offset(pgd, addr);
+ kasan_remove_p4d_table(p4d, addr, next);
+ kasan_free_p4d(p4d_offset(pgd, 0), pgd);
+ }
+}
+
+int kasan_add_zero_shadow(void *start, unsigned long size)
+{
+ int ret;
+ void *shadow_start, *shadow_end;
+
+ shadow_start = kasan_mem_to_shadow(start);
+ shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT);
+
+ if (WARN_ON((unsigned long)start %
+ (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
+ WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ return -EINVAL;
+
+ ret = kasan_populate_zero_shadow(shadow_start, shadow_end);
+ if (ret)
+ kasan_remove_zero_shadow(shadow_start,
+ size >> KASAN_SHADOW_SCALE_SHIFT);
+ return ret;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d7b2a4bf8671..961cbe9062a5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -397,6 +397,26 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
+static bool hugepage_vma_check(struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+ (vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ return false;
+ if (shmem_file(vma->vm_file)) {
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ return false;
+ return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+ HPAGE_PMD_NR);
+ }
+ if (!vma->anon_vma || vma->vm_ops)
+ return false;
+ if (is_vma_temporary_stack(vma))
+ return false;
+ return !(vm_flags & VM_NO_KHUGEPAGED);
+}
+
int __khugepaged_enter(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
@@ -434,15 +454,14 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
unsigned long vm_flags)
{
unsigned long hstart, hend;
- if (!vma->anon_vma)
- /*
- * Not yet faulted in so we will register later in the
- * page fault if needed.
- */
- return 0;
- if (vma->vm_ops || (vm_flags & VM_NO_KHUGEPAGED))
- /* khugepaged not yet working on file or special mappings */
+
+ /*
+ * khugepaged does not yet work on non-shmem files or special
+ * mappings. And file-private shmem THP is not supported.
+ */
+ if (!hugepage_vma_check(vma, vm_flags))
return 0;
+
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
@@ -819,25 +838,6 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
}
#endif
-static bool hugepage_vma_check(struct vm_area_struct *vma)
-{
- if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
- (vma->vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- return false;
- if (shmem_file(vma->vm_file)) {
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
- return false;
- return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
- HPAGE_PMD_NR);
- }
- if (!vma->anon_vma || vma->vm_ops)
- return false;
- if (is_vma_temporary_stack(vma))
- return false;
- return !(vma->vm_flags & VM_NO_KHUGEPAGED);
-}
-
/*
* If mmap_sem temporarily dropped, revalidate vma
* before taking mmap_sem.
@@ -862,7 +862,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
hend = vma->vm_end & HPAGE_PMD_MASK;
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
return SCAN_ADDRESS_RANGE;
- if (!hugepage_vma_check(vma))
+ if (!hugepage_vma_check(vma, vma->vm_flags))
return SCAN_VMA_CHECK;
return 0;
}
@@ -1517,6 +1517,8 @@ tree_unlocked:
unlock_page(new_page);
*hpage = NULL;
+
+ khugepaged_pages_collapsed++;
} else {
/* Something went wrong: rollback changes to the radix-tree */
shmem_uncharge(mapping->host, nr_none);
@@ -1694,7 +1696,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
progress++;
break;
}
- if (!hugepage_vma_check(vma)) {
+ if (!hugepage_vma_check(vma, vma->vm_flags)) {
skip:
progress++;
continue;
diff --git a/mm/ksm.c b/mm/ksm.c
index a6d43cf9a982..2621be57bd95 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -470,7 +470,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
- int ret = 0;
+ vm_fault_t ret = 0;
do {
cond_resched();
@@ -2430,6 +2430,9 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
VM_HUGETLB | VM_MIXEDMAP))
return 0; /* just ignore the advice */
+ if (vma_is_dax(vma))
+ return 0;
+
#ifdef VM_SAO
if (*vm_flags & VM_SAO)
return 0;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index fcfb6c89ed47..5b30625fd365 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -12,7 +12,7 @@
#include <linux/mutex.h>
#include <linux/memcontrol.h>
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
static LIST_HEAD(list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);
@@ -29,17 +29,12 @@ static void list_lru_unregister(struct list_lru *lru)
list_del(&lru->list);
mutex_unlock(&list_lrus_mutex);
}
-#else
-static void list_lru_register(struct list_lru *lru)
-{
-}
-static void list_lru_unregister(struct list_lru *lru)
+static int lru_shrinker_id(struct list_lru *lru)
{
+ return lru->shrinker_id;
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
/*
@@ -75,20 +70,39 @@ static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
}
static inline struct list_lru_one *
-list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
+ struct mem_cgroup **memcg_ptr)
{
- struct mem_cgroup *memcg;
+ struct list_lru_one *l = &nlru->lru;
+ struct mem_cgroup *memcg = NULL;
if (!nlru->memcg_lrus)
- return &nlru->lru;
+ goto out;
memcg = mem_cgroup_from_kmem(ptr);
if (!memcg)
- return &nlru->lru;
+ goto out;
- return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+ l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+out:
+ if (memcg_ptr)
+ *memcg_ptr = memcg;
+ return l;
}
#else
+static void list_lru_register(struct list_lru *lru)
+{
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+}
+
+static int lru_shrinker_id(struct list_lru *lru)
+{
+ return -1;
+}
+
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
return false;
@@ -101,23 +115,30 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
}
static inline struct list_lru_one *
-list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
+ struct mem_cgroup **memcg_ptr)
{
+ if (memcg_ptr)
+ *memcg_ptr = NULL;
return &nlru->lru;
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
+ struct mem_cgroup *memcg;
struct list_lru_one *l;
spin_lock(&nlru->lock);
if (list_empty(item)) {
- l = list_lru_from_kmem(nlru, item);
+ l = list_lru_from_kmem(nlru, item, &memcg);
list_add_tail(item, &l->list);
- l->nr_items++;
+ /* Set shrinker bit if the first element was added */
+ if (!l->nr_items++)
+ memcg_set_shrinker_bit(memcg, nid,
+ lru_shrinker_id(lru));
nlru->nr_items++;
spin_unlock(&nlru->lock);
return true;
@@ -135,7 +156,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
spin_lock(&nlru->lock);
if (!list_empty(item)) {
- l = list_lru_from_kmem(nlru, item);
+ l = list_lru_from_kmem(nlru, item, NULL);
list_del_init(item);
l->nr_items--;
nlru->nr_items--;
@@ -162,26 +183,20 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
}
EXPORT_SYMBOL_GPL(list_lru_isolate_move);
-static unsigned long __list_lru_count_one(struct list_lru *lru,
- int nid, int memcg_idx)
+unsigned long list_lru_count_one(struct list_lru *lru,
+ int nid, struct mem_cgroup *memcg)
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
unsigned long count;
rcu_read_lock();
- l = list_lru_from_memcg_idx(nlru, memcg_idx);
+ l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
count = l->nr_items;
rcu_read_unlock();
return count;
}
-
-unsigned long list_lru_count_one(struct list_lru *lru,
- int nid, struct mem_cgroup *memcg)
-{
- return __list_lru_count_one(lru, nid, memcg_cache_id(memcg));
-}
EXPORT_SYMBOL_GPL(list_lru_count_one);
unsigned long list_lru_count_node(struct list_lru *lru, int nid)
@@ -194,17 +209,15 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid)
EXPORT_SYMBOL_GPL(list_lru_count_node);
static unsigned long
-__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
+__list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
- struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
struct list_head *item, *n;
unsigned long isolated = 0;
- spin_lock(&nlru->lock);
l = list_lru_from_memcg_idx(nlru, memcg_idx);
restart:
list_for_each_safe(item, n, &l->list) {
@@ -250,8 +263,6 @@ restart:
BUG();
}
}
-
- spin_unlock(&nlru->lock);
return isolated;
}
@@ -260,11 +271,32 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
- return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg),
- isolate, cb_arg, nr_to_walk);
+ struct list_lru_node *nlru = &lru->node[nid];
+ unsigned long ret;
+
+ spin_lock(&nlru->lock);
+ ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
+ nr_to_walk);
+ spin_unlock(&nlru->lock);
+ return ret;
}
EXPORT_SYMBOL_GPL(list_lru_walk_one);
+unsigned long
+list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
+{
+ struct list_lru_node *nlru = &lru->node[nid];
+ unsigned long ret;
+
+ spin_lock_irq(&nlru->lock);
+ ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
+ nr_to_walk);
+ spin_unlock_irq(&nlru->lock);
+ return ret;
+}
+
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
@@ -272,12 +304,18 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
long isolated = 0;
int memcg_idx;
- isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
- nr_to_walk);
+ isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
+ nr_to_walk);
if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
for_each_memcg_cache_index(memcg_idx) {
- isolated += __list_lru_walk_one(lru, nid, memcg_idx,
- isolate, cb_arg, nr_to_walk);
+ struct list_lru_node *nlru = &lru->node[nid];
+
+ spin_lock(&nlru->lock);
+ isolated += __list_lru_walk_one(nlru, memcg_idx,
+ isolate, cb_arg,
+ nr_to_walk);
+ spin_unlock(&nlru->lock);
+
if (*nr_to_walk <= 0)
break;
}
@@ -292,7 +330,7 @@ static void init_one_lru(struct list_lru_one *l)
l->nr_items = 0;
}
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
int begin, int end)
{
@@ -500,10 +538,13 @@ fail:
goto out;
}
-static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
- int src_idx, int dst_idx)
+static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
+ int src_idx, struct mem_cgroup *dst_memcg)
{
+ struct list_lru_node *nlru = &lru->node[nid];
+ int dst_idx = dst_memcg->kmemcg_id;
struct list_lru_one *src, *dst;
+ bool set;
/*
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
@@ -515,14 +556,17 @@ static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
dst = list_lru_from_memcg_idx(nlru, dst_idx);
list_splice_init(&src->list, &dst->list);
+ set = (!dst->nr_items && src->nr_items);
dst->nr_items += src->nr_items;
+ if (set)
+ memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
src->nr_items = 0;
spin_unlock_irq(&nlru->lock);
}
static void memcg_drain_list_lru(struct list_lru *lru,
- int src_idx, int dst_idx)
+ int src_idx, struct mem_cgroup *dst_memcg)
{
int i;
@@ -530,16 +574,16 @@ static void memcg_drain_list_lru(struct list_lru *lru,
return;
for_each_node(i)
- memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
+ memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg);
}
-void memcg_drain_all_list_lrus(int src_idx, int dst_idx)
+void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg)
{
struct list_lru *lru;
mutex_lock(&list_lrus_mutex);
list_for_each_entry(lru, &list_lrus, list)
- memcg_drain_list_lru(lru, src_idx, dst_idx);
+ memcg_drain_list_lru(lru, src_idx, dst_memcg);
mutex_unlock(&list_lrus_mutex);
}
#else
@@ -551,15 +595,21 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
static void memcg_destroy_list_lru(struct list_lru *lru)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
- struct lock_class_key *key)
+ struct lock_class_key *key, struct shrinker *shrinker)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
int err = -ENOMEM;
+#ifdef CONFIG_MEMCG_KMEM
+ if (shrinker)
+ lru->shrinker_id = shrinker->id;
+ else
+ lru->shrinker_id = -1;
+#endif
memcg_get_cache_ids();
lru->node = kzalloc(size, GFP_KERNEL);
@@ -602,6 +652,9 @@ void list_lru_destroy(struct list_lru *lru)
kfree(lru->node);
lru->node = NULL;
+#ifdef CONFIG_MEMCG_KMEM
+ lru->shrinker_id = -1;
+#endif
memcg_put_cache_ids();
}
EXPORT_SYMBOL_GPL(list_lru_destroy);
diff --git a/mm/memblock.c b/mm/memblock.c
index b4ad05764745..237944479d25 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -392,7 +392,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
{
struct memblock_region *new_array, *old_array;
phys_addr_t old_alloc_size, new_alloc_size;
- phys_addr_t old_size, new_size, addr;
+ phys_addr_t old_size, new_size, addr, new_end;
int use_slab = slab_is_available();
int *in_slab;
@@ -453,9 +453,9 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
return -1;
}
- memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
- type->name, type->max * 2, (u64)addr,
- (u64)addr + new_size - 1);
+ new_end = addr + new_size - 1;
+ memblock_dbg("memblock: %s is doubled to %ld at [%pa-%pa]",
+ type->name, type->max * 2, &addr, &new_end);
/*
* Found space, we now need to move the array over before we add the
@@ -1438,9 +1438,9 @@ void * __init memblock_virt_alloc_try_nid_raw(
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr, (void *)_RET_IP_);
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
ptr = memblock_virt_alloc_internal(size, align,
min_addr, max_addr, nid);
@@ -1475,9 +1475,9 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr, (void *)_RET_IP_);
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
ptr = memblock_virt_alloc_internal(size, align,
min_addr, max_addr, nid);
@@ -1511,9 +1511,9 @@ void * __init memblock_virt_alloc_try_nid(
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr, (void *)_RET_IP_);
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr,
+ &max_addr, (void *)_RET_IP_);
ptr = memblock_virt_alloc_internal(size, align,
min_addr, max_addr, nid);
if (ptr) {
@@ -1521,9 +1521,8 @@ void * __init memblock_virt_alloc_try_nid(
return ptr;
}
- panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr);
+ panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa\n",
+ __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr);
return NULL;
}
#endif
@@ -1538,9 +1537,10 @@ void * __init memblock_virt_alloc_try_nid(
*/
void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
{
- memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
- __func__, (u64)base, (u64)base + size - 1,
- (void *)_RET_IP_);
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("%s: [%pa-%pa] %pF\n",
+ __func__, &base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
memblock_remove_range(&memblock.reserved, base, size);
}
@@ -1556,11 +1556,11 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
*/
void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
{
- u64 cursor, end;
+ phys_addr_t cursor, end;
- memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
- __func__, (u64)base, (u64)base + size - 1,
- (void *)_RET_IP_);
+ end = base + size - 1;
+ memblock_dbg("%s: [%pa-%pa] %pF\n",
+ __func__, &base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
cursor = PFN_UP(base);
end = PFN_DOWN(base + size);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b836e7f00309..6a921890739f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -233,6 +233,21 @@ enum res_type {
/* Used for OOM nofiier */
#define OOM_CONTROL (0)
+/*
+ * Iteration constructs for visiting all cgroups (under a tree). If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root) \
+ for (iter = mem_cgroup_iter(root, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(root, iter, NULL))
+
+#define for_each_mem_cgroup(iter) \
+ for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(NULL, iter, NULL))
+
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -246,12 +261,7 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
-static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
-{
- return (memcg == root_mem_cgroup);
-}
-
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
* The main reason for not using cgroup id for this:
@@ -305,7 +315,135 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
struct workqueue_struct *memcg_kmem_cache_wq;
-#endif /* !CONFIG_SLOB */
+static int memcg_shrinker_map_size;
+static DEFINE_MUTEX(memcg_shrinker_map_mutex);
+
+static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
+{
+ kvfree(container_of(head, struct memcg_shrinker_map, rcu));
+}
+
+static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
+ int size, int old_size)
+{
+ struct memcg_shrinker_map *new, *old;
+ int nid;
+
+ lockdep_assert_held(&memcg_shrinker_map_mutex);
+
+ for_each_node(nid) {
+ old = rcu_dereference_protected(
+ mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
+ /* Not yet online memcg */
+ if (!old)
+ return 0;
+
+ new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ /* Set all old bits, clear all new bits */
+ memset(new->map, (int)0xff, old_size);
+ memset((void *)new->map + old_size, 0, size - old_size);
+
+ rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
+ call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
+ }
+
+ return 0;
+}
+
+static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_per_node *pn;
+ struct memcg_shrinker_map *map;
+ int nid;
+
+ if (mem_cgroup_is_root(memcg))
+ return;
+
+ for_each_node(nid) {
+ pn = mem_cgroup_nodeinfo(memcg, nid);
+ map = rcu_dereference_protected(pn->shrinker_map, true);
+ if (map)
+ kvfree(map);
+ rcu_assign_pointer(pn->shrinker_map, NULL);
+ }
+}
+
+static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
+{
+ struct memcg_shrinker_map *map;
+ int nid, size, ret = 0;
+
+ if (mem_cgroup_is_root(memcg))
+ return 0;
+
+ mutex_lock(&memcg_shrinker_map_mutex);
+ size = memcg_shrinker_map_size;
+ for_each_node(nid) {
+ map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
+ if (!map) {
+ memcg_free_shrinker_maps(memcg);
+ ret = -ENOMEM;
+ break;
+ }
+ rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
+ }
+ mutex_unlock(&memcg_shrinker_map_mutex);
+
+ return ret;
+}
+
+int memcg_expand_shrinker_maps(int new_id)
+{
+ int size, old_size, ret = 0;
+ struct mem_cgroup *memcg;
+
+ size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
+ old_size = memcg_shrinker_map_size;
+ if (size <= old_size)
+ return 0;
+
+ mutex_lock(&memcg_shrinker_map_mutex);
+ if (!root_mem_cgroup)
+ goto unlock;
+
+ for_each_mem_cgroup(memcg) {
+ if (mem_cgroup_is_root(memcg))
+ continue;
+ ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
+ if (ret)
+ goto unlock;
+ }
+unlock:
+ if (!ret)
+ memcg_shrinker_map_size = size;
+ mutex_unlock(&memcg_shrinker_map_mutex);
+ return ret;
+}
+
+void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
+{
+ if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
+ struct memcg_shrinker_map *map;
+
+ rcu_read_lock();
+ map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
+ /* Pairs with smp mb in shrink_slab() */
+ smp_mb__before_atomic();
+ set_bit(shrinker_id, map->map);
+ rcu_read_unlock();
+ }
+}
+
+#else /* CONFIG_MEMCG_KMEM */
+static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
+#endif /* CONFIG_MEMCG_KMEM */
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
@@ -678,9 +816,20 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
}
EXPORT_SYMBOL(mem_cgroup_from_task);
-static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+/**
+ * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
+ * @mm: mm from which memcg should be extracted. It can be NULL.
+ *
+ * Obtain a reference on mm->memcg and returns it if successful. Otherwise
+ * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
+ * returned.
+ */
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
- struct mem_cgroup *memcg = NULL;
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return NULL;
rcu_read_lock();
do {
@@ -700,6 +849,46 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
rcu_read_unlock();
return memcg;
}
+EXPORT_SYMBOL(get_mem_cgroup_from_mm);
+
+/**
+ * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
+ * @page: page from which memcg should be extracted.
+ *
+ * Obtain a reference on page->memcg and returns it if successful. Otherwise
+ * root_mem_cgroup is returned.
+ */
+struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ rcu_read_lock();
+ if (!memcg || !css_tryget_online(&memcg->css))
+ memcg = root_mem_cgroup;
+ rcu_read_unlock();
+ return memcg;
+}
+EXPORT_SYMBOL(get_mem_cgroup_from_page);
+
+/**
+ * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
+ */
+static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+ if (unlikely(current->active_memcg)) {
+ struct mem_cgroup *memcg = root_mem_cgroup;
+
+ rcu_read_lock();
+ if (css_tryget_online(&current->active_memcg->css))
+ memcg = current->active_memcg;
+ rcu_read_unlock();
+ return memcg;
+ }
+ return get_mem_cgroup_from_mm(current->mm);
+}
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
@@ -862,21 +1051,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
}
}
-/*
- * Iteration constructs for visiting all cgroups (under a tree). If
- * loops are exited prematurely (break), mem_cgroup_iter_break() must
- * be used for reference counting.
- */
-#define for_each_mem_cgroup_tree(iter, root) \
- for (iter = mem_cgroup_iter(root, NULL, NULL); \
- iter != NULL; \
- iter = mem_cgroup_iter(root, iter, NULL))
-
-#define for_each_mem_cgroup(iter) \
- for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
- iter != NULL; \
- iter = mem_cgroup_iter(NULL, iter, NULL))
-
/**
* mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
* @memcg: hierarchy root
@@ -1483,28 +1657,53 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
-static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+enum oom_status {
+ OOM_SUCCESS,
+ OOM_FAILED,
+ OOM_ASYNC,
+ OOM_SKIPPED
+};
+
+static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER)
- return;
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ return OOM_SKIPPED;
+
/*
* We are in the middle of the charge context here, so we
* don't want to block when potentially sitting on a callstack
* that holds all kinds of filesystem and mm locks.
*
- * Also, the caller may handle a failed allocation gracefully
- * (like optional page cache readahead) and so an OOM killer
- * invocation might not even be necessary.
+ * cgroup1 allows disabling the OOM killer and waiting for outside
+ * handling until the charge can succeed; remember the context and put
+ * the task to sleep at the end of the page fault when all locks are
+ * released.
+ *
+ * On the other hand, in-kernel OOM killer allows for an async victim
+ * memory reclaim (oom_reaper) and that means that we are not solely
+ * relying on the oom victim to make a forward progress and we can
+ * invoke the oom killer here.
*
- * That's why we don't do anything here except remember the
- * OOM context and then deal with it at the end of the page
- * fault when the stack is unwound, the locks are released,
- * and when we know whether the fault was overall successful.
+ * Please note that mem_cgroup_out_of_memory might fail to find a
+ * victim and then we have to bail out from the charge path.
*/
- css_get(&memcg->css);
- current->memcg_in_oom = memcg;
- current->memcg_oom_gfp_mask = mask;
- current->memcg_oom_order = order;
+ if (memcg->oom_kill_disable) {
+ if (!current->in_user_fault)
+ return OOM_SKIPPED;
+ css_get(&memcg->css);
+ current->memcg_in_oom = memcg;
+ current->memcg_oom_gfp_mask = mask;
+ current->memcg_oom_order = order;
+
+ return OOM_ASYNC;
+ }
+
+ if (mem_cgroup_out_of_memory(memcg, mask, order))
+ return OOM_SUCCESS;
+
+ WARN(1,"Memory cgroup charge failed because of no reclaimable memory! "
+ "This looks like a misconfiguration or a kernel bug.");
+ return OOM_FAILED;
}
/**
@@ -1899,6 +2098,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
+ bool oomed = false;
+ enum oom_status oom_status;
if (mem_cgroup_is_root(memcg))
return 0;
@@ -1986,6 +2187,9 @@ retry:
if (nr_retries--)
goto retry;
+ if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
+ goto nomem;
+
if (gfp_mask & __GFP_NOFAIL)
goto force;
@@ -1994,8 +2198,23 @@ retry:
memcg_memory_event(mem_over_limit, MEMCG_OOM);
- mem_cgroup_oom(mem_over_limit, gfp_mask,
+ /*
+ * keep retrying as long as the memcg oom killer is able to make
+ * a forward progress or bypass the charge if the oom killer
+ * couldn't make any progress.
+ */
+ oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
get_order(nr_pages * PAGE_SIZE));
+ switch (oom_status) {
+ case OOM_SUCCESS:
+ nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ oomed = true;
+ goto retry;
+ case OOM_FAILED:
+ goto force;
+ default:
+ goto nomem;
+ }
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
@@ -2119,7 +2338,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
unlock_page_lru(page, isolated);
}
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2261,7 +2480,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
if (current->memcg_kmem_skip_account)
return cachep;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ memcg = get_mem_cgroup_from_current();
kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
goto out;
@@ -2345,7 +2564,7 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
if (memcg_kmem_bypass())
return 0;
- memcg = get_mem_cgroup_from_mm(current->mm);
+ memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
if (!ret)
@@ -2384,7 +2603,7 @@ void memcg_kmem_uncharge(struct page *page, int order)
css_put_many(&memcg->css, nr_pages);
}
-#endif /* !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2779,7 +2998,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
int memcg_id;
@@ -2851,7 +3070,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
}
rcu_read_unlock();
- memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+ memcg_drain_all_list_lrus(kmemcg_id, parent);
memcg_free_cache_id(kmemcg_id);
}
@@ -2879,7 +3098,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
static void memcg_free_kmem(struct mem_cgroup *memcg)
{
}
-#endif /* !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
static int memcg_update_kmem_max(struct mem_cgroup *memcg,
unsigned long max)
@@ -4183,7 +4402,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
memcg->socket_pressure = jiffies;
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -4260,6 +4479,16 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ /*
+ * A memcg must be visible for memcg_expand_shrinker_maps()
+ * by the time the maps are allocated. So, we allocate maps
+ * here, when for_each_mem_cgroup() can't skip it.
+ */
+ if (memcg_alloc_shrinker_maps(memcg)) {
+ mem_cgroup_id_remove(memcg);
+ return -ENOMEM;
+ }
+
/* Online state pins memcg ID, memcg ID pins CSS */
atomic_set(&memcg->id.ref, 1);
css_get(css);
@@ -4312,6 +4541,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
+ memcg_free_shrinker_maps(memcg);
memcg_free_kmem(memcg);
mem_cgroup_free(memcg);
}
@@ -6023,7 +6253,7 @@ static int __init mem_cgroup_init(void)
{
int cpu, node;
-#ifndef CONFIG_SLOB
+#ifdef CONFIG_MEMCG_KMEM
/*
* Kmem cache creation is mostly done with the slab_mutex held,
* so use a workqueue with limited concurrency to avoid stalling
diff --git a/mm/memory.c b/mm/memory.c
index 348279ff6e51..19f47d7b9b86 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -859,6 +859,10 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return NULL;
}
}
+
+ if (pte_devmap(pte))
+ return NULL;
+
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
@@ -923,6 +927,8 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
}
}
+ if (pmd_devmap(pmd))
+ return NULL;
if (is_zero_pfn(pfn))
return NULL;
if (unlikely(pfn > highest_memmap_pfn))
@@ -1607,20 +1613,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
+ for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
unmap_single_vma(&tlb, vma, start, end, NULL);
-
- /*
- * zap_page_range does not specify whether mmap_sem should be
- * held for read or write. That allows parallel zap_page_range
- * operations to unmap a PTE and defer a flush meaning that
- * this call observes pte_none and fails to flush the TLB.
- * Rather than adding a complex API, ensure that no stale
- * TLB entries exist when this call returns.
- */
- flush_tlb_range(vma, start, end);
- }
-
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
@@ -3394,7 +3388,7 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page)
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
+ add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
page_add_file_rmap(page, true);
/*
* deposit and withdraw with pmd lock held
@@ -4147,7 +4141,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
- mem_cgroup_oom_enable();
+ mem_cgroup_enter_user_fault();
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
@@ -4155,7 +4149,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
ret = __handle_mm_fault(vma, address, flags);
if (flags & FAULT_FLAG_USER) {
- mem_cgroup_oom_disable();
+ mem_cgroup_exit_user_fault();
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
@@ -4593,71 +4587,93 @@ EXPORT_SYMBOL(__might_fault);
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
-static void clear_gigantic_page(struct page *page,
- unsigned long addr,
- unsigned int pages_per_huge_page)
-{
- int i;
- struct page *p = page;
-
- might_sleep();
- for (i = 0; i < pages_per_huge_page;
- i++, p = mem_map_next(p, page, i)) {
- cond_resched();
- clear_user_highpage(p, addr + i * PAGE_SIZE);
- }
-}
-void clear_huge_page(struct page *page,
- unsigned long addr_hint, unsigned int pages_per_huge_page)
+/*
+ * Process all subpages of the specified huge page with the specified
+ * operation. The target subpage will be processed last to keep its
+ * cache lines hot.
+ */
+static inline void process_huge_page(
+ unsigned long addr_hint, unsigned int pages_per_huge_page,
+ void (*process_subpage)(unsigned long addr, int idx, void *arg),
+ void *arg)
{
int i, n, base, l;
unsigned long addr = addr_hint &
~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
- if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
- clear_gigantic_page(page, addr, pages_per_huge_page);
- return;
- }
-
- /* Clear sub-page to access last to keep its cache lines hot */
+ /* Process target subpage last to keep its cache lines hot */
might_sleep();
n = (addr_hint - addr) / PAGE_SIZE;
if (2 * n <= pages_per_huge_page) {
- /* If sub-page to access in first half of huge page */
+ /* If target subpage in first half of huge page */
base = 0;
l = n;
- /* Clear sub-pages at the end of huge page */
+ /* Process subpages at the end of huge page */
for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ process_subpage(addr + i * PAGE_SIZE, i, arg);
}
} else {
- /* If sub-page to access in second half of huge page */
+ /* If target subpage in second half of huge page */
base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
l = pages_per_huge_page - n;
- /* Clear sub-pages at the begin of huge page */
+ /* Process subpages at the begin of huge page */
for (i = 0; i < base; i++) {
cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ process_subpage(addr + i * PAGE_SIZE, i, arg);
}
}
/*
- * Clear remaining sub-pages in left-right-left-right pattern
- * towards the sub-page to access
+ * Process remaining subpages in left-right-left-right pattern
+ * towards the target subpage
*/
for (i = 0; i < l; i++) {
int left_idx = base + i;
int right_idx = base + 2 * l - 1 - i;
cond_resched();
- clear_user_highpage(page + left_idx,
- addr + left_idx * PAGE_SIZE);
+ process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
cond_resched();
- clear_user_highpage(page + right_idx,
- addr + right_idx * PAGE_SIZE);
+ process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
}
}
+static void clear_gigantic_page(struct page *page,
+ unsigned long addr,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *p = page;
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page;
+ i++, p = mem_map_next(p, page, i)) {
+ cond_resched();
+ clear_user_highpage(p, addr + i * PAGE_SIZE);
+ }
+}
+
+static void clear_subpage(unsigned long addr, int idx, void *arg)
+{
+ struct page *page = arg;
+
+ clear_user_highpage(page + idx, addr);
+}
+
+void clear_huge_page(struct page *page,
+ unsigned long addr_hint, unsigned int pages_per_huge_page)
+{
+ unsigned long addr = addr_hint &
+ ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ clear_gigantic_page(page, addr, pages_per_huge_page);
+ return;
+ }
+
+ process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
+}
+
static void copy_user_gigantic_page(struct page *dst, struct page *src,
unsigned long addr,
struct vm_area_struct *vma,
@@ -4677,11 +4693,31 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src,
}
}
+struct copy_subpage_arg {
+ struct page *dst;
+ struct page *src;
+ struct vm_area_struct *vma;
+};
+
+static void copy_subpage(unsigned long addr, int idx, void *arg)
+{
+ struct copy_subpage_arg *copy_arg = arg;
+
+ copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
+ addr, copy_arg->vma);
+}
+
void copy_user_huge_page(struct page *dst, struct page *src,
- unsigned long addr, struct vm_area_struct *vma,
+ unsigned long addr_hint, struct vm_area_struct *vma,
unsigned int pages_per_huge_page)
{
- int i;
+ unsigned long addr = addr_hint &
+ ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
+ struct copy_subpage_arg arg = {
+ .dst = dst,
+ .src = src,
+ .vma = vma,
+ };
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
copy_user_gigantic_page(dst, src, addr, vma,
@@ -4689,11 +4725,7 @@ void copy_user_huge_page(struct page *dst, struct page *src,
return;
}
- might_sleep();
- for (i = 0; i < pages_per_huge_page; i++) {
- cond_resched();
- copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
- }
+ process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}
long copy_huge_page_from_user(struct page *dst_page,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7deb49f69e27..4eb6e824a80c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1034,8 +1034,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
return pgdat;
}
-static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
+static void rollback_node_hotadd(int nid)
{
+ pg_data_t *pgdat = NODE_DATA(nid);
+
arch_refresh_nodedata(nid, NULL);
free_percpu(pgdat->per_cpu_nodestats);
arch_free_nodedata(pgdat);
@@ -1046,28 +1048,48 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
/**
* try_online_node - online a node if offlined
* @nid: the node ID
- *
+ * @start: start addr of the node
+ * @set_node_online: Whether we want to online the node
* called by cpu_up() to online a node without onlined memory.
+ *
+ * Returns:
+ * 1 -> a new node has been allocated
+ * 0 -> the node is already online
+ * -ENOMEM -> the node could not be allocated
*/
-int try_online_node(int nid)
+static int __try_online_node(int nid, u64 start, bool set_node_online)
{
- pg_data_t *pgdat;
- int ret;
+ pg_data_t *pgdat;
+ int ret = 1;
if (node_online(nid))
return 0;
- mem_hotplug_begin();
- pgdat = hotadd_new_pgdat(nid, 0);
+ pgdat = hotadd_new_pgdat(nid, start);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
goto out;
}
- node_set_online(nid);
- ret = register_one_node(nid);
- BUG_ON(ret);
+
+ if (set_node_online) {
+ node_set_online(nid);
+ ret = register_one_node(nid);
+ BUG_ON(ret);
+ }
out:
+ return ret;
+}
+
+/*
+ * Users of this function always want to online/register the node
+ */
+int try_online_node(int nid)
+{
+ int ret;
+
+ mem_hotplug_begin();
+ ret = __try_online_node(nid, 0, true);
mem_hotplug_done();
return ret;
}
@@ -1099,9 +1121,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
int __ref add_memory_resource(int nid, struct resource *res, bool online)
{
u64 start, size;
- pg_data_t *pgdat = NULL;
- bool new_pgdat;
- bool new_node;
+ bool new_node = false;
int ret;
start = res->start;
@@ -1111,11 +1131,6 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
if (ret)
return ret;
- { /* Stupid hack to suppress address-never-null warning */
- void *p = NODE_DATA(nid);
- new_pgdat = !p;
- }
-
mem_hotplug_begin();
/*
@@ -1126,48 +1141,31 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
*/
memblock_add_node(start, size, nid);
- new_node = !node_online(nid);
- if (new_node) {
- pgdat = hotadd_new_pgdat(nid, start);
- ret = -ENOMEM;
- if (!pgdat)
- goto error;
- }
+ ret = __try_online_node(nid, start, false);
+ if (ret < 0)
+ goto error;
+ new_node = ret;
/* call arch's memory hotadd */
ret = arch_add_memory(nid, start, size, NULL, true);
-
if (ret < 0)
goto error;
- /* we online node here. we can't roll back from here. */
- node_set_online(nid);
-
if (new_node) {
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
-
- ret = __register_one_node(nid);
- if (ret)
- goto register_fail;
-
- /*
- * link memory sections under this node. This is already
- * done when creatig memory section in register_new_memory
- * but that depends to have the node registered so offline
- * nodes have to go through register_node.
- * TODO clean up this mess.
- */
- ret = link_mem_sections(nid, start_pfn, nr_pages, false);
-register_fail:
- /*
- * If sysfs file of new node can't create, cpu on the node
+ /* If sysfs file of new node can't be created, cpu on the node
* can't be hot-added. There is no rollback way now.
* So, check by BUG_ON() to catch it reluctantly..
+ * We online node here. We can't roll back from here.
*/
+ node_set_online(nid);
+ ret = __register_one_node(nid);
BUG_ON(ret);
}
+ /* link memory sections under this node.*/
+ ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
+ BUG_ON(ret);
+
/* create new memmap entry */
firmware_map_add_hotplug(start, start + size, "System RAM");
@@ -1180,8 +1178,8 @@ register_fail:
error:
/* rollback pgdat allocation and others */
- if (new_pgdat && pgdat)
- rollback_node_hotadd(nid, pgdat);
+ if (new_node)
+ rollback_node_hotadd(nid);
memblock_remove(start, size);
out:
diff --git a/mm/mempool.c b/mm/mempool.c
index b54f2c20e5e0..44f5fa98c1e7 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -111,7 +111,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
kasan_free_pages(element, (unsigned long)pool->pool_data);
}
-static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags)
+static void kasan_unpoison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_unpoison_slab(element);
@@ -127,12 +127,12 @@ static __always_inline void add_element(mempool_t *pool, void *element)
pool->elements[pool->curr_nr++] = element;
}
-static void *remove_element(mempool_t *pool, gfp_t flags)
+static void *remove_element(mempool_t *pool)
{
void *element = pool->elements[--pool->curr_nr];
BUG_ON(pool->curr_nr < 0);
- kasan_unpoison_element(pool, element, flags);
+ kasan_unpoison_element(pool, element);
check_element(pool, element);
return element;
}
@@ -151,7 +151,7 @@ static void *remove_element(mempool_t *pool, gfp_t flags)
void mempool_exit(mempool_t *pool)
{
while (pool->curr_nr) {
- void *element = remove_element(pool, GFP_KERNEL);
+ void *element = remove_element(pool);
pool->free(element, pool->pool_data);
}
kfree(pool->elements);
@@ -301,7 +301,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
while (new_min_nr < pool->curr_nr) {
- element = remove_element(pool, GFP_KERNEL);
+ element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
pool->free(element, pool->pool_data);
spin_lock_irqsave(&pool->lock, flags);
@@ -387,7 +387,7 @@ repeat_alloc:
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) {
- element = remove_element(pool, gfp_temp);
+ element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
/* paired with rmb in mempool_free(), read comment there */
smp_wmb();
diff --git a/mm/migrate.c b/mm/migrate.c
index 8c0af0f7cab1..4a83268e23c2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2951,7 +2951,8 @@ int migrate_vma(const struct migrate_vma_ops *ops,
/* Sanity check the arguments */
start &= PAGE_MASK;
end &= PAGE_MASK;
- if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
+ if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
+ vma_is_dax(vma))
return -EINVAL;
if (start < vma->vm_start || start >= vma->vm_end)
return -EINVAL;
diff --git a/mm/mlock.c b/mm/mlock.c
index 74e5a6547c3d..41cc47e28ad6 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -527,7 +527,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
vm_flags_t old_flags = vma->vm_flags;
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
- is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
+ is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
+ vma_is_dax(vma))
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;
diff --git a/mm/mmap.c b/mm/mmap.c
index 17bbf4d3e24f..8d6449e74431 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1796,11 +1796,12 @@ out:
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
- if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current->mm)))
- mm->locked_vm += (len >> PAGE_SHIFT);
- else
+ if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
+ is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ else
+ mm->locked_vm += (len >> PAGE_SHIFT);
}
if (file)
diff --git a/mm/nommu.c b/mm/nommu.c
index 9fc9e43335b6..e4aac33216ae 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -364,10 +364,6 @@ void *vzalloc_node(unsigned long size, int node)
}
EXPORT_SYMBOL(vzalloc_node);
-#ifndef PAGE_KERNEL_EXEC
-# define PAGE_KERNEL_EXEC PAGE_KERNEL
-#endif
-
/**
* vmalloc_exec - allocate virtually contiguous, executable memory
* @size: allocation size
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 84081e77bc51..412f43453a68 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -53,6 +53,14 @@ int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
+/*
+ * Serializes oom killer invocations (out_of_memory()) from all contexts to
+ * prevent from over eager oom killing (e.g. when the oom killer is invoked
+ * from different domains).
+ *
+ * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
+ * and mark_oom_victim
+ */
DEFINE_MUTEX(oom_lock);
#ifdef CONFIG_NUMA
@@ -1077,15 +1085,9 @@ bool out_of_memory(struct oom_control *oc)
dump_header(oc, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (oc->chosen && oc->chosen != (void *)-1UL) {
+ if (oc->chosen && oc->chosen != (void *)-1UL)
oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
"Memory cgroup out of memory");
- /*
- * Give the killed process a good chance to exit before trying
- * to allocate memory again.
- */
- schedule_timeout_killable(1);
- }
return !!oc->chosen;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 337c6afb3345..6551d3b0dc30 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2490,8 +2490,8 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers);
/*
* Call this whenever redirtying a page, to de-account the dirty counters
- * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
- * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to
+ * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
+ * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
* systematic errors in balanced_dirty_ratelimit and the dirty pages position
* control.
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0922ef5d2e46..15ea511fb41c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4165,11 +4165,12 @@ retry:
alloc_flags = reserve_flags;
/*
- * Reset the zonelist iterators if memory policies can be ignored.
- * These allocations are high priority and system rather than user
- * orientated.
+ * Reset the nodemask and zonelist iterators if memory policies can be
+ * ignored. These allocations are high priority and system rather than
+ * user oriented.
*/
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
+ ac->nodemask = NULL;
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
@@ -4403,19 +4404,15 @@ out:
EXPORT_SYMBOL(__alloc_pages_nodemask);
/*
- * Common helper functions.
+ * Common helper functions. Never use with __GFP_HIGHMEM because the returned
+ * address cannot represent highmem pages. Use alloc_pages and then kmap if
+ * you need to access high mem.
*/
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *page;
- /*
- * __get_free_pages() returns a virtual address, which cannot represent
- * a highmem page
- */
- VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
-
- page = alloc_pages(gfp_mask, order);
+ page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
@@ -5567,13 +5564,12 @@ static int zone_batchsize(struct zone *zone)
/*
* The per-cpu-pages pools are set to around 1000th of the
- * size of the zone. But no more than 1/2 of a meg.
- *
- * OK, so we don't know how big the cache is. So guess.
+ * size of the zone.
*/
batch = zone->managed_pages / 1024;
- if (batch * PAGE_SIZE > 512 * 1024)
- batch = (512 * 1024) / PAGE_SIZE;
+ /* But no more than a meg. */
+ if (batch * PAGE_SIZE > 1024 * 1024)
+ batch = (1024 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
@@ -6405,8 +6401,11 @@ void __paginginit zero_resv_unavail(void)
pgcnt = 0;
for_each_resv_unavail_range(i, &start, &end) {
for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages)))
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ + pageblock_nr_pages - 1;
continue;
+ }
mm_zero_struct_page(pfn_to_page(pfn));
pgcnt++;
}
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 5295ef331165..a9826da84ccb 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -120,7 +120,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
pgdat->node_page_ext = NULL;
}
-struct page_ext *lookup_page_ext(struct page *page)
+struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
unsigned long index;
@@ -195,7 +195,7 @@ fail:
#else /* CONFIG_FLAT_NODE_MEM_MAP */
-struct page_ext *lookup_page_ext(struct page *page)
+struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
diff --git a/mm/shmem.c b/mm/shmem.c
index 06ebe17bb924..c48c79018a7c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,6 +29,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
+#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/swap.h>
@@ -2188,7 +2189,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
- inode->i_generation = get_seconds();
+ inode->i_generation = prandom_u32();
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
diff --git a/mm/slab.h b/mm/slab.h
index 68bdf498da3b..58c6c1c2a78e 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -203,7 +203,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
/* List of all root caches. */
extern struct list_head slab_root_caches;
@@ -296,7 +296,7 @@ extern void memcg_link_cache(struct kmem_cache *s);
extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
void (*deact_fn)(struct kmem_cache *));
-#else /* CONFIG_MEMCG && !CONFIG_SLOB */
+#else /* CONFIG_MEMCG_KMEM */
/* If !memcg, all caches are root. */
#define slab_root_caches slab_caches
@@ -351,7 +351,7 @@ static inline void memcg_link_cache(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 2296caf87bfb..fea3376f9816 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -127,7 +127,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
return i;
}
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
LIST_HEAD(slab_root_caches);
@@ -256,7 +256,7 @@ static inline void destroy_memcg_params(struct kmem_cache *s)
static inline void memcg_unlink_cache(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
/*
* Figure out what the alignment of the objects will be given a set of
@@ -584,7 +584,7 @@ static int shutdown_cache(struct kmem_cache *s)
return 0;
}
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+#ifdef CONFIG_MEMCG_KMEM
/*
* memcg_create_kmem_cache - Create a cache for a memory cgroup.
* @memcg: The memory cgroup the new cache is for.
@@ -861,7 +861,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s)
static inline void flush_memcg_workqueue(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
+#endif /* CONFIG_MEMCG_KMEM */
void slab_kmem_cache_release(struct kmem_cache *s)
{
diff --git a/mm/slub.c b/mm/slub.c
index 51258eff4178..ce2b9e5cea77 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -271,8 +271,7 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
- if (object)
- prefetch(freelist_dereference(s, object + s->offset));
+ prefetch(object + s->offset);
}
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index bd0276d5f66b..8301293331a2 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -43,12 +43,9 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
unsigned long goal)
{
return memblock_virt_alloc_try_nid_raw(size, align, goal,
- BOOTMEM_ALLOC_ACCESSIBLE, node);
+ BOOTMEM_ALLOC_ACCESSIBLE, node);
}
-static void *vmemmap_buf;
-static void *vmemmap_buf_end;
-
void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
/* If the main allocator is up use that, fallback to bootmem. */
@@ -76,18 +73,10 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
/* need to make sure size is all the same during early stage */
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
{
- void *ptr;
-
- if (!vmemmap_buf)
- return vmemmap_alloc_block(size, node);
-
- /* take the from buf */
- ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
- if (ptr + size > vmemmap_buf_end)
- return vmemmap_alloc_block(size, node);
-
- vmemmap_buf = ptr + size;
+ void *ptr = sparse_buffer_alloc(size);
+ if (!ptr)
+ ptr = vmemmap_alloc_block(size, node);
return ptr;
}
@@ -272,45 +261,3 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid,
return map;
}
-
-void __init sparse_mem_maps_populate_node(struct page **map_map,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count, int nodeid)
-{
- unsigned long pnum;
- unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
- void *vmemmap_buf_start;
-
- size = ALIGN(size, PMD_SIZE);
- vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
- PMD_SIZE, __pa(MAX_DMA_ADDRESS));
-
- if (vmemmap_buf_start) {
- vmemmap_buf = vmemmap_buf_start;
- vmemmap_buf_end = vmemmap_buf_start + size * map_count;
- }
-
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- struct mem_section *ms;
-
- if (!present_section_nr(pnum))
- continue;
-
- map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL);
- if (map_map[pnum])
- continue;
- ms = __nr_to_section(pnum);
- pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
- __func__);
- ms->section_mem_map = 0;
- }
-
- if (vmemmap_buf_start) {
- /* need to free left buf */
- memblock_free_early(__pa(vmemmap_buf),
- vmemmap_buf_end - vmemmap_buf);
- vmemmap_buf = NULL;
- vmemmap_buf_end = NULL;
- }
-}
diff --git a/mm/sparse.c b/mm/sparse.c
index f13f2723950a..10b07eea9a6e 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -200,6 +200,11 @@ static inline int next_present_section_nr(int section_nr)
(section_nr <= __highest_present_section_nr)); \
section_nr = next_present_section_nr(section_nr))
+static inline unsigned long first_present_section_nr(void)
+{
+ return next_present_section_nr(-1);
+}
+
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
@@ -257,19 +262,14 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
-static int __meminit sparse_init_one_section(struct mem_section *ms,
+static void __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map,
unsigned long *pageblock_bitmap)
{
- if (!present_section(ms))
- return -EINVAL;
-
ms->section_mem_map &= ~SECTION_MAP_MASK;
ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
SECTION_HAS_MEM_MAP;
ms->pageblock_flags = pageblock_bitmap;
-
- return 1;
}
unsigned long usemap_size(void)
@@ -370,160 +370,121 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
-static void __init sparse_early_usemaps_alloc_node(void *data,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long usemap_count, int nodeid)
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static unsigned long __init section_map_size(void)
{
- void *usemap;
- unsigned long pnum;
- unsigned long **usemap_map = (unsigned long **)data;
- int size = usemap_size();
-
- usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
- size * usemap_count);
- if (!usemap) {
- pr_warn("%s: allocation failed\n", __func__);
- return;
- }
+ return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
+}
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- if (!present_section_nr(pnum))
- continue;
- usemap_map[pnum] = usemap;
- usemap += size;
- check_usemap_section_nr(nodeid, usemap_map[pnum]);
- }
+#else
+static unsigned long __init section_map_size(void)
+{
+ return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
struct vmem_altmap *altmap)
{
- struct page *map;
- unsigned long size;
+ unsigned long size = section_map_size();
+ struct page *map = sparse_buffer_alloc(size);
+
+ if (map)
+ return map;
- size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
map = memblock_virt_alloc_try_nid(size,
PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
BOOTMEM_ALLOC_ACCESSIBLE, nid);
return map;
}
-void __init sparse_mem_maps_populate_node(struct page **map_map,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count, int nodeid)
-{
- void *map;
- unsigned long pnum;
- unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
-
- size = PAGE_ALIGN(size);
- map = memblock_virt_alloc_try_nid_raw(size * map_count,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
- if (map) {
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- if (!present_section_nr(pnum))
- continue;
- map_map[pnum] = map;
- map += size;
- }
- return;
- }
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
- /* fallback */
- for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
- struct mem_section *ms;
+static void *sparsemap_buf __meminitdata;
+static void *sparsemap_buf_end __meminitdata;
- if (!present_section_nr(pnum))
- continue;
- map_map[pnum] = sparse_mem_map_populate(pnum, nodeid, NULL);
- if (map_map[pnum])
- continue;
- ms = __nr_to_section(pnum);
- pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
- __func__);
- ms->section_mem_map = 0;
- }
+static void __init sparse_buffer_init(unsigned long size, int nid)
+{
+ WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
+ sparsemap_buf =
+ memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE,
+ __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ sparsemap_buf_end = sparsemap_buf + size;
}
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
-static void __init sparse_early_mem_maps_alloc_node(void *data,
- unsigned long pnum_begin,
- unsigned long pnum_end,
- unsigned long map_count, int nodeid)
+static void __init sparse_buffer_fini(void)
{
- struct page **map_map = (struct page **)data;
- sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
- map_count, nodeid);
+ unsigned long size = sparsemap_buf_end - sparsemap_buf;
+
+ if (sparsemap_buf && size > 0)
+ memblock_free_early(__pa(sparsemap_buf), size);
+ sparsemap_buf = NULL;
}
-#else
-static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
-{
- struct page *map;
- struct mem_section *ms = __nr_to_section(pnum);
- int nid = sparse_early_nid(ms);
- map = sparse_mem_map_populate(pnum, nid, NULL);
- if (map)
- return map;
+void * __meminit sparse_buffer_alloc(unsigned long size)
+{
+ void *ptr = NULL;
- pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
- __func__);
- ms->section_mem_map = 0;
- return NULL;
+ if (sparsemap_buf) {
+ ptr = PTR_ALIGN(sparsemap_buf, size);
+ if (ptr + size > sparsemap_buf_end)
+ ptr = NULL;
+ else
+ sparsemap_buf = ptr + size;
+ }
+ return ptr;
}
-#endif
void __weak __meminit vmemmap_populate_print_last(void)
{
}
-/**
- * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap
- * @map: usemap_map for pageblock flags or mmap_map for vmemmap
+/*
+ * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
+ * And number of present sections in this node is map_count.
*/
-static void __init alloc_usemap_and_memmap(void (*alloc_func)
- (void *, unsigned long, unsigned long,
- unsigned long, int), void *data)
+static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count)
{
- unsigned long pnum;
- unsigned long map_count;
- int nodeid_begin = 0;
- unsigned long pnum_begin = 0;
-
- for_each_present_section_nr(0, pnum) {
- struct mem_section *ms;
+ unsigned long pnum, usemap_longs, *usemap;
+ struct page *map;
- ms = __nr_to_section(pnum);
- nodeid_begin = sparse_early_nid(ms);
- pnum_begin = pnum;
- break;
+ usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
+ usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
+ usemap_size() *
+ map_count);
+ if (!usemap) {
+ pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
+ goto failed;
+ }
+ sparse_buffer_init(map_count * section_map_size(), nid);
+ for_each_present_section_nr(pnum_begin, pnum) {
+ if (pnum >= pnum_end)
+ break;
+
+ map = sparse_mem_map_populate(pnum, nid, NULL);
+ if (!map) {
+ pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
+ __func__, nid);
+ pnum_begin = pnum;
+ goto failed;
+ }
+ check_usemap_section_nr(nid, usemap);
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
+ usemap += usemap_longs;
}
- map_count = 1;
- for_each_present_section_nr(pnum_begin + 1, pnum) {
+ sparse_buffer_fini();
+ return;
+failed:
+ /* We failed to allocate, mark all the following pnums as not present */
+ for_each_present_section_nr(pnum_begin, pnum) {
struct mem_section *ms;
- int nodeid;
+ if (pnum >= pnum_end)
+ break;
ms = __nr_to_section(pnum);
- nodeid = sparse_early_nid(ms);
- if (nodeid == nodeid_begin) {
- map_count++;
- continue;
- }
- /* ok, we need to take cake of from pnum_begin to pnum - 1*/
- alloc_func(data, pnum_begin, pnum,
- map_count, nodeid_begin);
- /* new start, update count etc*/
- nodeid_begin = nodeid;
- pnum_begin = pnum;
- map_count = 1;
+ ms->section_mem_map = 0;
}
- /* ok, last chunk */
- alloc_func(data, pnum_begin, __highest_present_section_nr+1,
- map_count, nodeid_begin);
}
/*
@@ -532,72 +493,29 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func)
*/
void __init sparse_init(void)
{
- unsigned long pnum;
- struct page *map;
- unsigned long *usemap;
- unsigned long **usemap_map;
- int size;
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- int size2;
- struct page **map_map;
-#endif
-
- /* see include/linux/mmzone.h 'struct mem_section' definition */
- BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
+ unsigned long pnum_begin = first_present_section_nr();
+ int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
+ unsigned long pnum_end, map_count = 1;
/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
set_pageblock_order();
- /*
- * map is using big page (aka 2M in x86 64 bit)
- * usemap is less one page (aka 24 bytes)
- * so alloc 2M (with 2M align) and 24 bytes in turn will
- * make next 2M slip to one more 2M later.
- * then in big system, the memory will have a lot of holes...
- * here try to allocate 2M pages continuously.
- *
- * powerpc need to call sparse_init_one_section right after each
- * sparse_early_mem_map_alloc, so allocate usemap_map at first.
- */
- size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
- usemap_map = memblock_virt_alloc(size, 0);
- if (!usemap_map)
- panic("can not allocate usemap_map\n");
- alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
- (void *)usemap_map);
-
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
- map_map = memblock_virt_alloc(size2, 0);
- if (!map_map)
- panic("can not allocate map_map\n");
- alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
- (void *)map_map);
-#endif
-
- for_each_present_section_nr(0, pnum) {
- usemap = usemap_map[pnum];
- if (!usemap)
- continue;
+ for_each_present_section_nr(pnum_begin + 1, pnum_end) {
+ int nid = sparse_early_nid(__nr_to_section(pnum_end));
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- map = map_map[pnum];
-#else
- map = sparse_early_mem_map_alloc(pnum);
-#endif
- if (!map)
+ if (nid == nid_begin) {
+ map_count++;
continue;
-
- sparse_init_one_section(__nr_to_section(pnum), pnum, map,
- usemap);
+ }
+ /* Init node with sections in range [pnum_begin, pnum_end) */
+ sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
+ nid_begin = nid;
+ pnum_begin = pnum_end;
+ map_count = 1;
}
-
+ /* cover the last node */
+ sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
vmemmap_populate_print_last();
-
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- memblock_free_early(__pa(map_map), size2);
-#endif
- memblock_free_early(__pa(usemap_map), size);
}
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -760,6 +678,7 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
ret = sparse_index_init(section_nr, pgdat->node_id);
if (ret < 0 && ret != -EEXIST)
return ret;
+ ret = 0;
memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap);
if (!memmap)
return -ENOMEM;
@@ -786,12 +705,11 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
#endif
section_mark_present(ms);
-
- ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
+ sparse_init_one_section(ms, section_nr, memmap, usemap);
out:
pgdat_resize_unlock(pgdat, &flags);
- if (ret <= 0) {
+ if (ret < 0) {
kfree(usemap);
__kfree_section_memmap(memmap, altmap);
}
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index a791411fed71..008ccb22fee6 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -38,9 +38,9 @@ static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
static bool swap_slot_cache_active;
bool swap_slot_cache_enabled;
static bool swap_slot_cache_initialized;
-DEFINE_MUTEX(swap_slots_cache_mutex);
+static DEFINE_MUTEX(swap_slots_cache_mutex);
/* Serialize swap slots cache enable/disable operations */
-DEFINE_MUTEX(swap_slots_cache_enable_mutex);
+static DEFINE_MUTEX(swap_slots_cache_enable_mutex);
static void __drain_swap_slots_cache(unsigned int type);
static void deactivate_swap_slots_cache(void);
diff --git a/mm/vmacache.c b/mm/vmacache.c
index db7596eb6132..ea517bef7dc5 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -6,6 +6,18 @@
#include <linux/sched/task.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
+#include <asm/pgtable.h>
+
+/*
+ * Hash based on the pmd of addr if configured with MMU, which provides a good
+ * hit rate for workloads with spatial locality. Otherwise, use pages.
+ */
+#ifdef CONFIG_MMU
+#define VMACACHE_SHIFT PMD_SHIFT
+#else
+#define VMACACHE_SHIFT PAGE_SHIFT
+#endif
+#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK)
/*
* Flush vma caches for threads that share a given mm.
@@ -87,6 +99,7 @@ static bool vmacache_valid(struct mm_struct *mm)
struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
{
+ int idx = VMACACHE_HASH(addr);
int i;
count_vm_vmacache_event(VMACACHE_FIND_CALLS);
@@ -95,16 +108,20 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
return NULL;
for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[i];
+ struct vm_area_struct *vma = current->vmacache.vmas[idx];
- if (!vma)
- continue;
- if (WARN_ON_ONCE(vma->vm_mm != mm))
- break;
- if (vma->vm_start <= addr && vma->vm_end > addr) {
- count_vm_vmacache_event(VMACACHE_FIND_HITS);
- return vma;
+ if (vma) {
+#ifdef CONFIG_DEBUG_VM_VMACACHE
+ if (WARN_ON_ONCE(vma->vm_mm != mm))
+ break;
+#endif
+ if (vma->vm_start <= addr && vma->vm_end > addr) {
+ count_vm_vmacache_event(VMACACHE_FIND_HITS);
+ return vma;
+ }
}
+ if (++idx == VMACACHE_SIZE)
+ idx = 0;
}
return NULL;
@@ -115,6 +132,7 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
+ int idx = VMACACHE_HASH(start);
int i;
count_vm_vmacache_event(VMACACHE_FIND_CALLS);
@@ -123,12 +141,14 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
return NULL;
for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[i];
+ struct vm_area_struct *vma = current->vmacache.vmas[idx];
if (vma && vma->vm_start == start && vma->vm_end == end) {
count_vm_vmacache_event(VMACACHE_FIND_HITS);
return vma;
}
+ if (++idx == VMACACHE_SIZE)
+ idx = 0;
}
return NULL;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index cfea25be7754..a728fc492557 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1907,10 +1907,6 @@ void *vzalloc_node(unsigned long size, int node)
}
EXPORT_SYMBOL(vzalloc_node);
-#ifndef PAGE_KERNEL_EXEC
-# define PAGE_KERNEL_EXEC PAGE_KERNEL
-#endif
-
/**
* vmalloc_exec - allocate virtually contiguous, executable memory
* @size: allocation size
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 03822f86f288..4375b1e9bd56 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -65,12 +65,6 @@ struct scan_control {
/* How many pages shrink_list() should reclaim */
unsigned long nr_to_reclaim;
- /* This context's GFP mask */
- gfp_t gfp_mask;
-
- /* Allocation order */
- int order;
-
/*
* Nodemask of nodes allowed by the caller. If NULL, all nodes
* are scanned.
@@ -83,12 +77,6 @@ struct scan_control {
*/
struct mem_cgroup *target_mem_cgroup;
- /* Scan (total_size >> priority) pages at once */
- int priority;
-
- /* The highest zone to isolate pages for reclaim from */
- enum zone_type reclaim_idx;
-
/* Writepage batching in laptop mode; RECLAIM_WRITE */
unsigned int may_writepage:1;
@@ -111,6 +99,18 @@ struct scan_control {
/* One of the zones is ready for compaction */
unsigned int compaction_ready:1;
+ /* Allocation order */
+ s8 order;
+
+ /* Scan (total_size >> priority) pages at once */
+ s8 priority;
+
+ /* The highest zone to isolate pages for reclaim from */
+ s8 reclaim_idx;
+
+ /* This context's GFP mask */
+ gfp_t gfp_mask;
+
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -169,6 +169,70 @@ unsigned long vm_total_pages;
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
+#ifdef CONFIG_MEMCG_KMEM
+
+/*
+ * We allow subsystems to populate their shrinker-related
+ * LRU lists before register_shrinker_prepared() is called
+ * for the shrinker, since we don't want to impose
+ * restrictions on their internal registration order.
+ * In this case shrink_slab_memcg() may find corresponding
+ * bit is set in the shrinkers map.
+ *
+ * This value is used by the function to detect registering
+ * shrinkers and to skip do_shrink_slab() calls for them.
+ */
+#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
+
+static DEFINE_IDR(shrinker_idr);
+static int shrinker_nr_max;
+
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ int id, ret = -ENOMEM;
+
+ down_write(&shrinker_rwsem);
+ /* This may call shrinker, so it must use down_read_trylock() */
+ id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ goto unlock;
+
+ if (id >= shrinker_nr_max) {
+ if (memcg_expand_shrinker_maps(id)) {
+ idr_remove(&shrinker_idr, id);
+ goto unlock;
+ }
+
+ shrinker_nr_max = id + 1;
+ }
+ shrinker->id = id;
+ ret = 0;
+unlock:
+ up_write(&shrinker_rwsem);
+ return ret;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+ int id = shrinker->id;
+
+ BUG_ON(id < 0);
+
+ down_write(&shrinker_rwsem);
+ idr_remove(&shrinker_idr, id);
+ up_write(&shrinker_rwsem);
+}
+#else /* CONFIG_MEMCG_KMEM */
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ return 0;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
@@ -313,11 +377,28 @@ int prealloc_shrinker(struct shrinker *shrinker)
shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
if (!shrinker->nr_deferred)
return -ENOMEM;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ if (prealloc_memcg_shrinker(shrinker))
+ goto free_deferred;
+ }
+
return 0;
+
+free_deferred:
+ kfree(shrinker->nr_deferred);
+ shrinker->nr_deferred = NULL;
+ return -ENOMEM;
}
void free_prealloced_shrinker(struct shrinker *shrinker)
{
+ if (!shrinker->nr_deferred)
+ return;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ unregister_memcg_shrinker(shrinker);
+
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
}
@@ -326,6 +407,9 @@ void register_shrinker_prepared(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
+#ifdef CONFIG_MEMCG_KMEM
+ idr_replace(&shrinker_idr, shrinker, shrinker->id);
+#endif
up_write(&shrinker_rwsem);
}
@@ -347,6 +431,8 @@ void unregister_shrinker(struct shrinker *shrinker)
{
if (!shrinker->nr_deferred)
return;
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ unregister_memcg_shrinker(shrinker);
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
up_write(&shrinker_rwsem);
@@ -371,9 +457,12 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
: SHRINK_BATCH;
long scanned = 0, next_deferred;
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+ nid = 0;
+
freeable = shrinker->count_objects(shrinker, shrinkctl);
- if (freeable == 0)
- return 0;
+ if (freeable == 0 || freeable == SHRINK_EMPTY)
+ return freeable;
/*
* copy the current shrinker scan count into a local variable
@@ -474,6 +563,84 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
return freed;
}
+#ifdef CONFIG_MEMCG_KMEM
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg, int priority)
+{
+ struct memcg_shrinker_map *map;
+ unsigned long freed = 0;
+ int ret, i;
+
+ if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
+ return 0;
+
+ if (!down_read_trylock(&shrinker_rwsem))
+ return 0;
+
+ map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
+ true);
+ if (unlikely(!map))
+ goto unlock;
+
+ for_each_set_bit(i, map->map, shrinker_nr_max) {
+ struct shrink_control sc = {
+ .gfp_mask = gfp_mask,
+ .nid = nid,
+ .memcg = memcg,
+ };
+ struct shrinker *shrinker;
+
+ shrinker = idr_find(&shrinker_idr, i);
+ if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+ if (!shrinker)
+ clear_bit(i, map->map);
+ continue;
+ }
+
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY) {
+ clear_bit(i, map->map);
+ /*
+ * After the shrinker reported that it had no objects to
+ * free, but before we cleared the corresponding bit in
+ * the memcg shrinker map, a new object might have been
+ * added. To make sure, we have the bit set in this
+ * case, we invoke the shrinker one more time and reset
+ * the bit if it reports that it is not empty anymore.
+ * The memory barrier here pairs with the barrier in
+ * memcg_set_shrinker_bit():
+ *
+ * list_lru_add() shrink_slab_memcg()
+ * list_add_tail() clear_bit()
+ * <MB> <MB>
+ * set_bit() do_shrink_slab()
+ */
+ smp_mb__after_atomic();
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY)
+ ret = 0;
+ else
+ memcg_set_shrinker_bit(memcg, nid, i);
+ }
+ freed += ret;
+
+ if (rwsem_is_contended(&shrinker_rwsem)) {
+ freed = freed ? : 1;
+ break;
+ }
+ }
+unlock:
+ up_read(&shrinker_rwsem);
+ return freed;
+}
+#else /* CONFIG_MEMCG_KMEM */
+static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg, int priority)
+{
+ return 0;
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
/**
* shrink_slab - shrink slab caches
* @gfp_mask: allocation context
@@ -486,10 +653,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
* unaware shrinkers will receive a node id of 0 instead.
*
- * @memcg specifies the memory cgroup to target. If it is not NULL,
- * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
- * objects from the memory cgroup specified. Otherwise, only unaware
- * shrinkers are called.
+ * @memcg specifies the memory cgroup to target. Unaware shrinkers
+ * are called only if it is the root cgroup.
*
* @priority is sc->priority, we take the number of objects and >> by priority
* in order to get the scan target.
@@ -502,9 +667,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
{
struct shrinker *shrinker;
unsigned long freed = 0;
+ int ret;
- if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
- return 0;
+ if (!mem_cgroup_is_root(memcg))
+ return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
if (!down_read_trylock(&shrinker_rwsem))
goto out;
@@ -516,19 +682,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
.memcg = memcg,
};
- /*
- * If kernel memory accounting is disabled, we ignore
- * SHRINKER_MEMCG_AWARE flag and call all shrinkers
- * passing NULL for memcg.
- */
- if (memcg_kmem_enabled() &&
- !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
- continue;
-
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
- sc.nid = 0;
-
- freed += do_shrink_slab(&sc, shrinker, priority);
+ ret = do_shrink_slab(&sc, shrinker, priority);
+ if (ret == SHRINK_EMPTY)
+ ret = 0;
+ freed += ret;
/*
* Bail out if someone want to register a new shrinker to
* prevent the regsitration from being stalled for long periods
@@ -554,6 +711,7 @@ void drop_slab_node(int nid)
struct mem_cgroup *memcg = NULL;
freed = 0;
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
@@ -2573,9 +2731,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
node_lru_pages += lru_pages;
- if (memcg)
- shrink_slab(sc->gfp_mask, pgdat->node_id,
- memcg, sc->priority);
+ shrink_slab(sc->gfp_mask, pgdat->node_id,
+ memcg, sc->priority);
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
@@ -2599,10 +2756,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
}
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
- if (global_reclaim(sc))
- shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
- sc->priority);
-
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
@@ -3064,6 +3217,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
};
/*
+ * scan_control uses s8 fields for order, priority, and reclaim_idx.
+ * Confirm they are large enough for max values.
+ */
+ BUILD_BUG_ON(MAX_ORDER > S8_MAX);
+ BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
+ BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
+
+ /*
* Do not enter reclaim if fatal signal was delivered while throttled.
* 1 is returned so that the page allocator does not OOM kill at this
* point.
diff --git a/mm/workingset.c b/mm/workingset.c
index 40ee02c83978..4516dd790129 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -366,10 +366,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
unsigned long nodes;
unsigned long cache;
- /* list_lru lock nests inside the IRQ-safe i_pages lock */
- local_irq_disable();
nodes = list_lru_shrink_count(&shadow_nodes, sc);
- local_irq_enable();
/*
* Approximate a reasonable limit for the radix tree nodes
@@ -402,6 +399,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
}
max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3);
+ if (!nodes)
+ return SHRINK_EMPTY;
+
if (nodes <= max_nodes)
return 0;
return nodes - max_nodes;
@@ -434,7 +434,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
/* Coming from the list, invert the lock order */
if (!xa_trylock(&mapping->i_pages)) {
- spin_unlock(lru_lock);
+ spin_unlock_irq(lru_lock);
ret = LRU_RETRY;
goto out;
}
@@ -472,26 +472,20 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
workingset_lookup_update(mapping));
out_invalid:
- xa_unlock(&mapping->i_pages);
+ xa_unlock_irq(&mapping->i_pages);
ret = LRU_REMOVED_RETRY;
out:
- local_irq_enable();
cond_resched();
- local_irq_disable();
- spin_lock(lru_lock);
+ spin_lock_irq(lru_lock);
return ret;
}
static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
struct shrink_control *sc)
{
- unsigned long ret;
-
/* list_lru lock nests inside the IRQ-safe i_pages lock */
- local_irq_disable();
- ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
- local_irq_enable();
- return ret;
+ return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
+ NULL);
}
static struct shrinker workingset_shadow_shrinker = {
@@ -528,15 +522,17 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key);
+ ret = prealloc_shrinker(&workingset_shadow_shrinker);
if (ret)
goto err;
- ret = register_shrinker(&workingset_shadow_shrinker);
+ ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
+ &workingset_shadow_shrinker);
if (ret)
goto err_list_lru;
+ register_shrinker_prepared(&workingset_shadow_shrinker);
return 0;
err_list_lru:
- list_lru_destroy(&shadow_nodes);
+ free_prealloced_shrinker(&workingset_shadow_shrinker);
err:
return ret;
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 8d87e973a4f5..9da65552e7ca 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -924,20 +924,7 @@ static void reset_page(struct page *page)
page->freelist = NULL;
}
-/*
- * To prevent zspage destroy during migration, zspage freeing should
- * hold locks of all pages in the zspage.
- */
-void lock_zspage(struct zspage *zspage)
-{
- struct page *page = get_first_page(zspage);
-
- do {
- lock_page(page);
- } while ((page = get_next_page(page)) != NULL);
-}
-
-int trylock_zspage(struct zspage *zspage)
+static int trylock_zspage(struct zspage *zspage)
{
struct page *cursor, *fail;
@@ -1814,6 +1801,19 @@ static enum fullness_group putback_zspage(struct size_class *class,
}
#ifdef CONFIG_COMPACTION
+/*
+ * To prevent zspage destroy during migration, zspage freeing should
+ * hold locks of all pages in the zspage.
+ */
+static void lock_zspage(struct zspage *zspage)
+{
+ struct page *page = get_first_page(zspage);
+
+ do {
+ lock_page(page);
+ } while ((page = get_next_page(page)) != NULL);
+}
+
static struct dentry *zs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
@@ -1905,7 +1905,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
__SetPageMovable(newpage, page_mapping(oldpage));
}
-bool zs_page_isolate(struct page *page, isolate_mode_t mode)
+static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
struct zs_pool *pool;
struct size_class *class;
@@ -1960,7 +1960,7 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode)
return true;
}
-int zs_page_migrate(struct address_space *mapping, struct page *newpage,
+static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
struct page *page, enum migrate_mode mode)
{
struct zs_pool *pool;
@@ -2076,7 +2076,7 @@ unpin_objects:
return ret;
}
-void zs_page_putback(struct page *page)
+static void zs_page_putback(struct page *page)
{
struct zs_pool *pool;
struct size_class *class;
@@ -2108,7 +2108,7 @@ void zs_page_putback(struct page *page)
spin_unlock(&class->lock);
}
-const struct address_space_operations zsmalloc_aops = {
+static const struct address_space_operations zsmalloc_aops = {
.isolate_page = zs_page_isolate,
.migratepage = zs_page_migrate,
.putback_page = zs_page_putback,
diff --git a/scripts/spdxcheck.py b/scripts/spdxcheck.py
index 7deaef297f52..839e190bbd7a 100755
--- a/scripts/spdxcheck.py
+++ b/scripts/spdxcheck.py
@@ -4,6 +4,7 @@
from argparse import ArgumentParser
from ply import lex, yacc
+import locale
import traceback
import sys
import git
@@ -32,7 +33,7 @@ def read_spdxdata(repo):
# The subdirectories of LICENSES in the kernel source
license_dirs = [ "preferred", "other", "exceptions" ]
- lictree = repo.heads.master.commit.tree['LICENSES']
+ lictree = repo.head.commit.tree['LICENSES']
spdx = SPDXdata()
@@ -102,7 +103,7 @@ class id_parser(object):
raise ParserException(tok, 'Invalid License ID')
self.lastid = id
elif tok.type == 'EXC':
- if not self.spdx.exceptions.has_key(id):
+ if id not in self.spdx.exceptions:
raise ParserException(tok, 'Invalid Exception ID')
if self.lastid not in self.spdx.exceptions[id]:
raise ParserException(tok, 'Exception not valid for license %s' %self.lastid)
@@ -167,6 +168,7 @@ class id_parser(object):
self.curline = 0
try:
for line in fd:
+ line = line.decode(locale.getpreferredencoding(False), errors='ignore')
self.curline += 1
if self.curline > maxlines:
break
@@ -199,11 +201,10 @@ def scan_git_tree(tree):
continue
if el.path.find("license-rules.rst") >= 0:
continue
- if el.path == 'scripts/checkpatch.pl':
- continue
if not os.path.isfile(el.path):
continue
- parser.parse_lines(open(el.path), args.maxlines, el.path)
+ with open(el.path, 'rb') as fd:
+ parser.parse_lines(fd, args.maxlines, el.path)
def scan_git_subtree(tree, path):
for p in path.strip('/').split('/'):
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index cce853dca691..30cb0a0713ff 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -75,8 +75,11 @@
#define KPF_BYTES 8
#define PROC_KPAGEFLAGS "/proc/kpageflags"
+#define PROC_KPAGECOUNT "/proc/kpagecount"
#define PROC_KPAGECGROUP "/proc/kpagecgroup"
+#define SYS_KERNEL_MM_PAGE_IDLE "/sys/kernel/mm/page_idle/bitmap"
+
/* [32-] kernel hacking assistances */
#define KPF_RESERVED 32
#define KPF_MLOCKED 33
@@ -168,11 +171,13 @@ static const char * const debugfs_known_mountpoints[] = {
static int opt_raw; /* for kernel developers */
static int opt_list; /* list pages (in ranges) */
+static int opt_mark_idle; /* set accessed bit */
static int opt_no_summary; /* don't show summary */
static pid_t opt_pid; /* process to walk */
const char *opt_file; /* file or directory path */
static uint64_t opt_cgroup; /* cgroup inode */
static int opt_list_cgroup;/* list page cgroup */
+static int opt_list_mapcnt;/* list page map count */
static const char *opt_kpageflags;/* kpageflags file to parse */
#define MAX_ADDR_RANGES 1024
@@ -194,7 +199,9 @@ static int page_size;
static int pagemap_fd;
static int kpageflags_fd;
+static int kpagecount_fd = -1;
static int kpagecgroup_fd = -1;
+static int page_idle_fd = -1;
static int opt_hwpoison;
static int opt_unpoison;
@@ -298,6 +305,15 @@ static unsigned long kpagecgroup_read(uint64_t *buf,
return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages);
}
+static unsigned long kpagecount_read(uint64_t *buf,
+ unsigned long index,
+ unsigned long pages)
+{
+ return kpagecount_fd < 0 ? pages :
+ do_u64_read(kpagecount_fd, PROC_KPAGECOUNT,
+ buf, index, pages);
+}
+
static unsigned long pagemap_read(uint64_t *buf,
unsigned long index,
unsigned long pages)
@@ -370,16 +386,18 @@ static char *page_flag_longname(uint64_t flags)
*/
static void show_page_range(unsigned long voffset, unsigned long offset,
- unsigned long size, uint64_t flags, uint64_t cgroup)
+ unsigned long size, uint64_t flags,
+ uint64_t cgroup, uint64_t mapcnt)
{
static uint64_t flags0;
static uint64_t cgroup0;
+ static uint64_t mapcnt0;
static unsigned long voff;
static unsigned long index;
static unsigned long count;
- if (flags == flags0 && cgroup == cgroup0 && offset == index + count &&
- size && voffset == voff + count) {
+ if (flags == flags0 && cgroup == cgroup0 && mapcnt == mapcnt0 &&
+ offset == index + count && size && voffset == voff + count) {
count += size;
return;
}
@@ -391,12 +409,15 @@ static void show_page_range(unsigned long voffset, unsigned long offset,
printf("%lu\t", voff);
if (opt_list_cgroup)
printf("@%llu\t", (unsigned long long)cgroup0);
+ if (opt_list_mapcnt)
+ printf("%lu\t", mapcnt0);
printf("%lx\t%lx\t%s\n",
index, count, page_flag_name(flags0));
}
flags0 = flags;
- cgroup0= cgroup;
+ cgroup0 = cgroup;
+ mapcnt0 = mapcnt;
index = offset;
voff = voffset;
count = size;
@@ -404,11 +425,11 @@ static void show_page_range(unsigned long voffset, unsigned long offset,
static void flush_page_range(void)
{
- show_page_range(0, 0, 0, 0, 0);
+ show_page_range(0, 0, 0, 0, 0, 0);
}
static void show_page(unsigned long voffset, unsigned long offset,
- uint64_t flags, uint64_t cgroup)
+ uint64_t flags, uint64_t cgroup, uint64_t mapcnt)
{
if (opt_pid)
printf("%lx\t", voffset);
@@ -416,6 +437,9 @@ static void show_page(unsigned long voffset, unsigned long offset,
printf("%lu\t", voffset);
if (opt_list_cgroup)
printf("@%llu\t", (unsigned long long)cgroup);
+ if (opt_list_mapcnt)
+ printf("%lu\t", mapcnt);
+
printf("%lx\t%s\n", offset, page_flag_name(flags));
}
@@ -567,6 +591,30 @@ static int unpoison_page(unsigned long offset)
return 0;
}
+static int mark_page_idle(unsigned long offset)
+{
+ static unsigned long off;
+ static uint64_t buf;
+ int len;
+
+ if ((offset / 64 == off / 64) || buf == 0) {
+ buf |= 1UL << (offset % 64);
+ off = offset;
+ return 0;
+ }
+
+ len = pwrite(page_idle_fd, &buf, 8, 8 * (off / 64));
+ if (len < 0) {
+ perror("mark page idle");
+ return len;
+ }
+
+ buf = 1UL << (offset % 64);
+ off = offset;
+
+ return 0;
+}
+
/*
* page frame walker
*/
@@ -599,7 +647,8 @@ static size_t hash_slot(uint64_t flags)
}
static void add_page(unsigned long voffset, unsigned long offset,
- uint64_t flags, uint64_t cgroup, uint64_t pme)
+ uint64_t flags, uint64_t cgroup, uint64_t mapcnt,
+ uint64_t pme)
{
flags = kpageflags_flags(flags, pme);
@@ -614,10 +663,13 @@ static void add_page(unsigned long voffset, unsigned long offset,
if (opt_unpoison)
unpoison_page(offset);
+ if (opt_mark_idle)
+ mark_page_idle(offset);
+
if (opt_list == 1)
- show_page_range(voffset, offset, 1, flags, cgroup);
+ show_page_range(voffset, offset, 1, flags, cgroup, mapcnt);
else if (opt_list == 2)
- show_page(voffset, offset, flags, cgroup);
+ show_page(voffset, offset, flags, cgroup, mapcnt);
nr_pages[hash_slot(flags)]++;
total_pages++;
@@ -631,6 +683,7 @@ static void walk_pfn(unsigned long voffset,
{
uint64_t buf[KPAGEFLAGS_BATCH];
uint64_t cgi[KPAGEFLAGS_BATCH];
+ uint64_t cnt[KPAGEFLAGS_BATCH];
unsigned long batch;
unsigned long pages;
unsigned long i;
@@ -654,8 +707,12 @@ static void walk_pfn(unsigned long voffset,
if (kpagecgroup_read(cgi, index, pages) != pages)
fatal("kpagecgroup returned fewer pages than expected");
+ if (kpagecount_read(cnt, index, batch) != pages)
+ fatal("kpagecount returned fewer pages than expected");
+
for (i = 0; i < pages; i++)
- add_page(voffset + i, index + i, buf[i], cgi[i], pme);
+ add_page(voffset + i, index + i,
+ buf[i], cgi[i], cnt[i], pme);
index += pages;
count -= pages;
@@ -673,9 +730,10 @@ static void walk_swap(unsigned long voffset, uint64_t pme)
return;
if (opt_list == 1)
- show_page_range(voffset, pagemap_swap_offset(pme), 1, flags, 0);
+ show_page_range(voffset, pagemap_swap_offset(pme),
+ 1, flags, 0, 0);
else if (opt_list == 2)
- show_page(voffset, pagemap_swap_offset(pme), flags, 0);
+ show_page(voffset, pagemap_swap_offset(pme), flags, 0, 0);
nr_pages[hash_slot(flags)]++;
total_pages++;
@@ -756,6 +814,9 @@ static void walk_addr_ranges(void)
else
walk_task(opt_offset[i], opt_size[i]);
+ if (opt_mark_idle)
+ mark_page_idle(0);
+
close(kpageflags_fd);
}
@@ -786,9 +847,11 @@ static void usage(void)
" -c|--cgroup path|@inode Walk pages within memory cgroup\n"
" -p|--pid pid Walk process address space\n"
" -f|--file filename Walk file address space\n"
+" -i|--mark-idle Mark pages idle\n"
" -l|--list Show page details in ranges\n"
" -L|--list-each Show page details one by one\n"
" -C|--list-cgroup Show cgroup inode for pages\n"
+" -M|--list-mapcnt Show page map count\n"
" -N|--no-summary Don't show summary info\n"
" -X|--hwpoison hwpoison pages\n"
" -x|--unpoison unpoison pages\n"
@@ -925,6 +988,7 @@ static void walk_file(const char *name, const struct stat *st)
uint8_t vec[PAGEMAP_BATCH];
uint64_t buf[PAGEMAP_BATCH], flags;
uint64_t cgroup = 0;
+ uint64_t mapcnt = 0;
unsigned long nr_pages, pfn, i;
off_t off, end = st->st_size;
int fd;
@@ -984,13 +1048,15 @@ got_sigbus:
continue;
if (!kpagecgroup_read(&cgroup, pfn, 1))
fatal("kpagecgroup_read failed");
+ if (!kpagecount_read(&mapcnt, pfn, 1))
+ fatal("kpagecount_read failed");
if (first && opt_list) {
first = 0;
flush_page_range();
show_file(name, st);
}
add_page(off / page_size + i, pfn,
- flags, cgroup, buf[i]);
+ flags, cgroup, mapcnt, buf[i]);
}
}
@@ -1190,9 +1256,11 @@ static const struct option opts[] = {
{ "bits" , 1, NULL, 'b' },
{ "cgroup" , 1, NULL, 'c' },
{ "describe" , 1, NULL, 'd' },
+ { "mark-idle" , 0, NULL, 'i' },
{ "list" , 0, NULL, 'l' },
{ "list-each" , 0, NULL, 'L' },
{ "list-cgroup", 0, NULL, 'C' },
+ { "list-mapcnt", 0, NULL, 'M' },
{ "no-summary", 0, NULL, 'N' },
{ "hwpoison" , 0, NULL, 'X' },
{ "unpoison" , 0, NULL, 'x' },
@@ -1208,7 +1276,8 @@ int main(int argc, char *argv[])
page_size = getpagesize();
while ((c = getopt_long(argc, argv,
- "rp:f:a:b:d:c:ClLNXxF:h", opts, NULL)) != -1) {
+ "rp:f:a:b:d:c:CilLMNXxF:h",
+ opts, NULL)) != -1) {
switch (c) {
case 'r':
opt_raw = 1;
@@ -1234,12 +1303,18 @@ int main(int argc, char *argv[])
case 'd':
describe_flags(optarg);
exit(0);
+ case 'i':
+ opt_mark_idle = 1;
+ break;
case 'l':
opt_list = 1;
break;
case 'L':
opt_list = 2;
break;
+ case 'M':
+ opt_list_mapcnt = 1;
+ break;
case 'N':
opt_no_summary = 1;
break;
@@ -1269,12 +1344,21 @@ int main(int argc, char *argv[])
if (opt_cgroup || opt_list_cgroup)
kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY);
+ if (opt_list && opt_list_mapcnt)
+ kpagecount_fd = checked_open(PROC_KPAGECOUNT, O_RDONLY);
+
+ if (opt_mark_idle && opt_file)
+ page_idle_fd = checked_open(SYS_KERNEL_MM_PAGE_IDLE, O_RDWR);
+
if (opt_list && opt_pid)
printf("voffset\t");
if (opt_list && opt_file)
printf("foffset\t");
if (opt_list && opt_list_cgroup)
printf("cgroup\t");
+ if (opt_list && opt_list_mapcnt)
+ printf("map-cnt\t");
+
if (opt_list == 1)
printf("offset\tlen\tflags\n");
if (opt_list == 2)
@@ -1296,5 +1380,11 @@ int main(int argc, char *argv[])
show_summary();
+ if (opt_list_mapcnt)
+ close(kpagecount_fd);
+
+ if (page_idle_fd >= 0)
+ close(page_idle_fd);
+
return 0;
}