summaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/Kconfig6
-rw-r--r--lib/Kconfig.debug53
-rw-r--r--lib/Kconfig.ubsan11
-rw-r--r--lib/Makefile5
-rw-r--r--lib/atomic64_test.c4
-rw-r--r--lib/bitmap.c50
-rw-r--r--lib/cpu-notifier-error-inject.c46
-rw-r--r--lib/debugobjects.c10
-rw-r--r--lib/dma-debug.c88
-rw-r--r--lib/genalloc.c3
-rw-r--r--lib/idr.c11
-rw-r--r--lib/iov_iter.c423
-rw-r--r--lib/irq_poll.c28
-rw-r--r--lib/kobject_uevent.c6
-rw-r--r--lib/kstrtox.c6
-rw-r--r--lib/list_debug.c99
-rw-r--r--lib/locking-selftest.c66
-rw-r--r--lib/lockref.c2
-rw-r--r--lib/mpi/mpi-pow.c7
-rw-r--r--lib/nlattr.c2
-rw-r--r--lib/nmi_backtrace.c42
-rw-r--r--lib/parser.c47
-rw-r--r--lib/percpu-refcount.c169
-rw-r--r--lib/percpu_counter.c25
-rw-r--r--lib/radix-tree.c336
-rw-r--r--lib/raid6/.gitignore1
-rw-r--r--lib/raid6/Makefile8
-rw-r--r--lib/raid6/algos.c18
-rw-r--r--lib/raid6/avx512.c569
-rw-r--r--lib/raid6/recov_avx512.c388
-rw-r--r--lib/raid6/recov_s390xc.c116
-rw-r--r--lib/raid6/s390vx.uc168
-rw-r--r--lib/raid6/test/Makefile5
-rw-r--r--lib/raid6/test/test.c7
-rw-r--r--lib/raid6/x86.h10
-rw-r--r--lib/random32.c6
-rw-r--r--lib/rbtree.c23
-rw-r--r--lib/rhashtable.c300
-rw-r--r--lib/sbitmap.c347
-rw-r--r--lib/stackdepot.c4
-rw-r--r--lib/strncpy_from_user.c2
-rw-r--r--lib/syscall.c15
-rw-r--r--lib/test_bpf.c3
-rw-r--r--lib/test_kasan.c29
-rw-r--r--lib/ucs2_string.c2
-rw-r--r--lib/win_minmax.c98
46 files changed, 3164 insertions, 500 deletions
diff --git a/lib/Kconfig b/lib/Kconfig
index d79909dc01ec..260a80e313b9 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -457,9 +457,6 @@ config NLATTR
config GENERIC_ATOMIC64
bool
-config ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
- def_bool y if GENERIC_ATOMIC64
-
config LRU_CACHE
tristate
@@ -550,4 +547,7 @@ config STACKDEPOT
bool
select STACKTRACE
+config SBITMAP
+ bool
+
endmenu
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cab7405f48d2..e6327d102184 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -13,7 +13,22 @@ config PRINTK_TIME
be included, not that the timestamp is recorded.
The behavior is also controlled by the kernel command line
- parameter printk.time=1. See Documentation/kernel-parameters.txt
+ parameter printk.time=1. See Documentation/admin-guide/kernel-parameters.rst
+
+config CONSOLE_LOGLEVEL_DEFAULT
+ int "Default console loglevel (1-15)"
+ range 1 15
+ default "7"
+ help
+ Default loglevel to determine what will be printed on the console.
+
+ Setting a default here is equivalent to passing in loglevel=<x> in
+ the kernel bootargs. loglevel=<x> continues to override whatever
+ value is specified here as well.
+
+ Note: This does not affect the log level of un-prefixed prink()
+ usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT
+ option.
config MESSAGE_LOGLEVEL_DEFAULT
int "Default message log level (1-7)"
@@ -26,6 +41,10 @@ config MESSAGE_LOGLEVEL_DEFAULT
that are auditing their logs closely may want to set it to a lower
priority.
+ Note: This does not affect what message level gets printed on the console
+ by default. To change that, use loglevel=<x> in the kernel bootargs,
+ or pick a different CONSOLE_LOGLEVEL_DEFAULT configuration value.
+
config BOOT_PRINTK_DELAY
bool "Delay each boot printk message by N milliseconds"
depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY
@@ -198,6 +217,7 @@ config FRAME_WARN
int "Warn for stack frames larger than (needs gcc 4.4)"
range 0 8192
default 0 if KASAN
+ default 2048 if GCC_PLUGIN_LATENT_ENTROPY
default 1024 if !64BIT
default 2048 if 64BIT
help
@@ -305,7 +325,7 @@ config DEBUG_SECTION_MISMATCH
a larger kernel).
- Run the section mismatch analysis for each module/built-in.o file.
When we run the section mismatch analysis on vmlinux.o, we
- lose valueble information about where the mismatch was
+ lose valuable information about where the mismatch was
introduced.
Running the analysis for each module/built-in.o file
tells where the mismatch happens much closer to the
@@ -1084,6 +1104,9 @@ config PROVE_LOCKING
For more details, see Documentation/locking/lockdep-design.txt.
+config PROVE_LOCKING_SMALL
+ bool
+
config LOCKDEP
bool
depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
@@ -1214,7 +1237,7 @@ config DEBUG_BUGVERBOSE
config DEBUG_LIST
bool "Debug linked list manipulation"
- depends on DEBUG_KERNEL
+ depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION
help
Enable this to turn on extended checks in the linked-list
walking routines.
@@ -1430,7 +1453,8 @@ config RCU_TRACE
select TRACE_CLOCK
help
This option provides tracing in RCU which presents stats
- in debugfs for debugging RCU implementation.
+ in debugfs for debugging RCU implementation. It also enables
+ additional tracepoints for ftrace-style event tracing.
Say Y here if you want to enable RCU tracing
Say N if you are unsure.
@@ -1857,15 +1881,6 @@ config PROVIDE_OHCI1394_DMA_INIT
See Documentation/debugging-via-ohci1394.txt for more information.
-config BUILD_DOCSRC
- bool "Build targets in Documentation/ tree"
- depends on HEADERS_CHECK
- help
- This option attempts to build objects from the source files in the
- kernel Documentation/ tree.
-
- Say N if you are unsure.
-
config DMA_API_DEBUG
bool "Enable debugging of DMA-API usage"
depends on HAVE_DMA_API_DEBUG
@@ -1969,6 +1984,16 @@ config TEST_STATIC_KEYS
If unsure, say N.
+config BUG_ON_DATA_CORRUPTION
+ bool "Trigger a BUG when data corruption is detected"
+ select DEBUG_LIST
+ help
+ Select this option if the kernel should BUG when it encounters
+ data corruption in kernel memory structures when they get checked
+ for validity.
+
+ If unsure, say N.
+
source "samples/Kconfig"
source "lib/Kconfig.kgdb"
@@ -1980,7 +2005,7 @@ config ARCH_HAS_DEVMEM_IS_ALLOWED
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
- depends on MMU
+ depends on MMU && DEVMEM
depends on ARCH_HAS_DEVMEM_IS_ALLOWED
default y if TILE || PPC
---help---
diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan
index 39494af9a84a..bc6e651df68c 100644
--- a/lib/Kconfig.ubsan
+++ b/lib/Kconfig.ubsan
@@ -1,6 +1,9 @@
config ARCH_HAS_UBSAN_SANITIZE_ALL
bool
+config ARCH_WANTS_UBSAN_NO_NULL
+ def_bool n
+
config UBSAN
bool "Undefined behaviour sanity checker"
help
@@ -34,3 +37,11 @@ config UBSAN_ALIGNMENT
This option enables detection of unaligned memory accesses.
Enabling this option on architectures that support unaligned
accesses may produce a lot of false positives.
+
+config UBSAN_NULL
+ bool "Enable checking of null pointers"
+ depends on UBSAN
+ default y if !ARCH_WANTS_UBSAN_NO_NULL
+ help
+ This option enables detection of memory accesses via a
+ null pointer.
diff --git a/lib/Makefile b/lib/Makefile
index 5dc77a8ec297..50144a3aeebd 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -22,7 +22,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
- earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o
+ earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
@@ -180,6 +180,7 @@ obj-$(CONFIG_IRQ_POLL) += irq_poll.o
obj-$(CONFIG_STACKDEPOT) += stackdepot.o
KASAN_SANITIZE_stackdepot.o := n
+KCOV_INSTRUMENT_stackdepot.o := n
libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \
fdt_empty_tree.o
@@ -227,3 +228,5 @@ obj-$(CONFIG_UCS2_STRING) += ucs2_string.o
obj-$(CONFIG_UBSAN) += ubsan.o
UBSAN_SANITIZE_ubsan.o := n
+
+obj-$(CONFIG_SBITMAP) += sbitmap.o
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index dbb369145dda..46042901130f 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -213,7 +213,6 @@ static __init void test_atomic64(void)
r += one;
BUG_ON(v.counter != r);
-#ifdef CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
INIT(onestwos);
BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1));
r -= one;
@@ -226,9 +225,6 @@ static __init void test_atomic64(void)
INIT(-one);
BUG_ON(atomic64_dec_if_positive(&v) != (-one - one));
BUG_ON(v.counter != r);
-#else
-#warning Please implement atomic64_dec_if_positive for your architecture and select the above Kconfig symbol
-#endif
INIT(onestwos);
BUG_ON(!atomic64_inc_not_zero(&v));
diff --git a/lib/bitmap.c b/lib/bitmap.c
index eca88087fa8a..0b66f0e5eb6b 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -496,6 +496,11 @@ EXPORT_SYMBOL(bitmap_print_to_pagebuf);
* ranges. Consecutively set bits are shown as two hyphen-separated
* decimal numbers, the smallest and largest bit numbers set in
* the range.
+ * Optionally each range can be postfixed to denote that only parts of it
+ * should be set. The range will divided to groups of specific size.
+ * From each group will be used only defined amount of bits.
+ * Syntax: range:used_size/group_size
+ * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769
*
* Returns 0 on success, -errno on invalid input strings.
* Error values:
@@ -507,16 +512,20 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
int is_user, unsigned long *maskp,
int nmaskbits)
{
- unsigned a, b;
+ unsigned int a, b, old_a, old_b;
+ unsigned int group_size, used_size;
int c, old_c, totaldigits, ndigits;
const char __user __force *ubuf = (const char __user __force *)buf;
- int at_start, in_range;
+ int at_start, in_range, in_partial_range;
totaldigits = c = 0;
+ old_a = old_b = 0;
+ group_size = used_size = 0;
bitmap_zero(maskp, nmaskbits);
do {
at_start = 1;
in_range = 0;
+ in_partial_range = 0;
a = b = 0;
ndigits = totaldigits;
@@ -547,6 +556,24 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
if ((totaldigits != ndigits) && isspace(old_c))
return -EINVAL;
+ if (c == '/') {
+ used_size = a;
+ at_start = 1;
+ in_range = 0;
+ a = b = 0;
+ continue;
+ }
+
+ if (c == ':') {
+ old_a = a;
+ old_b = b;
+ at_start = 1;
+ in_range = 0;
+ in_partial_range = 1;
+ a = b = 0;
+ continue;
+ }
+
if (c == '-') {
if (at_start || in_range)
return -EINVAL;
@@ -567,15 +594,30 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
}
if (ndigits == totaldigits)
continue;
+ if (in_partial_range) {
+ group_size = a;
+ a = old_a;
+ b = old_b;
+ old_a = old_b = 0;
+ }
/* if no digit is after '-', it's wrong*/
if (at_start && in_range)
return -EINVAL;
- if (!(a <= b))
+ if (!(a <= b) || !(used_size <= group_size))
return -EINVAL;
if (b >= nmaskbits)
return -ERANGE;
while (a <= b) {
- set_bit(a, maskp);
+ if (in_partial_range) {
+ static int pos_in_group = 1;
+
+ if (pos_in_group <= used_size)
+ set_bit(a, maskp);
+
+ if (a == b || ++pos_in_group > group_size)
+ pos_in_group = 1;
+ } else
+ set_bit(a, maskp);
a++;
}
} while (buflen && c == ',');
diff --git a/lib/cpu-notifier-error-inject.c b/lib/cpu-notifier-error-inject.c
index 707ca24f7b18..0e2c9a1e958a 100644
--- a/lib/cpu-notifier-error-inject.c
+++ b/lib/cpu-notifier-error-inject.c
@@ -8,16 +8,47 @@ static int priority;
module_param(priority, int, 0);
MODULE_PARM_DESC(priority, "specify cpu notifier priority");
+#define UP_PREPARE 0
+#define UP_PREPARE_FROZEN 0
+#define DOWN_PREPARE 0
+#define DOWN_PREPARE_FROZEN 0
+
static struct notifier_err_inject cpu_notifier_err_inject = {
.actions = {
- { NOTIFIER_ERR_INJECT_ACTION(CPU_UP_PREPARE) },
- { NOTIFIER_ERR_INJECT_ACTION(CPU_UP_PREPARE_FROZEN) },
- { NOTIFIER_ERR_INJECT_ACTION(CPU_DOWN_PREPARE) },
- { NOTIFIER_ERR_INJECT_ACTION(CPU_DOWN_PREPARE_FROZEN) },
+ { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE) },
+ { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE_FROZEN) },
+ { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE) },
+ { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE_FROZEN) },
{}
}
};
+static int notf_err_handle(struct notifier_err_inject_action *action)
+{
+ int ret;
+
+ ret = action->error;
+ if (ret)
+ pr_info("Injecting error (%d) to %s\n", ret, action->name);
+ return ret;
+}
+
+static int notf_err_inj_up_prepare(unsigned int cpu)
+{
+ if (!cpuhp_tasks_frozen)
+ return notf_err_handle(&cpu_notifier_err_inject.actions[0]);
+ else
+ return notf_err_handle(&cpu_notifier_err_inject.actions[1]);
+}
+
+static int notf_err_inj_dead(unsigned int cpu)
+{
+ if (!cpuhp_tasks_frozen)
+ return notf_err_handle(&cpu_notifier_err_inject.actions[2]);
+ else
+ return notf_err_handle(&cpu_notifier_err_inject.actions[3]);
+}
+
static struct dentry *dir;
static int err_inject_init(void)
@@ -29,7 +60,10 @@ static int err_inject_init(void)
if (IS_ERR(dir))
return PTR_ERR(dir);
- err = register_hotcpu_notifier(&cpu_notifier_err_inject.nb);
+ err = cpuhp_setup_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE,
+ "cpu-err-notif:prepare",
+ notf_err_inj_up_prepare,
+ notf_err_inj_dead);
if (err)
debugfs_remove_recursive(dir);
@@ -38,7 +72,7 @@ static int err_inject_init(void)
static void err_inject_exit(void)
{
- unregister_hotcpu_notifier(&cpu_notifier_err_inject.nb);
+ cpuhp_remove_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE);
debugfs_remove_recursive(dir);
}
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index a8e12601eb37..04c1ef717fe0 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -199,7 +199,7 @@ static void free_object(struct debug_obj *obj)
* initialized:
*/
if (obj_pool_free > ODEBUG_POOL_SIZE && obj_cache)
- sched = keventd_up();
+ sched = 1;
hlist_add_head(&obj->node, &obj_pool);
obj_pool_free++;
obj_pool_used--;
@@ -362,6 +362,7 @@ void debug_object_init(void *addr, struct debug_obj_descr *descr)
__debug_object_init(addr, descr, 0);
}
+EXPORT_SYMBOL_GPL(debug_object_init);
/**
* debug_object_init_on_stack - debug checks when an object on stack is
@@ -376,6 +377,7 @@ void debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr)
__debug_object_init(addr, descr, 1);
}
+EXPORT_SYMBOL_GPL(debug_object_init_on_stack);
/**
* debug_object_activate - debug checks when an object is activated
@@ -449,6 +451,7 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
}
return 0;
}
+EXPORT_SYMBOL_GPL(debug_object_activate);
/**
* debug_object_deactivate - debug checks when an object is deactivated
@@ -496,6 +499,7 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr)
raw_spin_unlock_irqrestore(&db->lock, flags);
}
+EXPORT_SYMBOL_GPL(debug_object_deactivate);
/**
* debug_object_destroy - debug checks when an object is destroyed
@@ -542,6 +546,7 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr)
out_unlock:
raw_spin_unlock_irqrestore(&db->lock, flags);
}
+EXPORT_SYMBOL_GPL(debug_object_destroy);
/**
* debug_object_free - debug checks when an object is freed
@@ -582,6 +587,7 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr)
out_unlock:
raw_spin_unlock_irqrestore(&db->lock, flags);
}
+EXPORT_SYMBOL_GPL(debug_object_free);
/**
* debug_object_assert_init - debug checks when object should be init-ed
@@ -626,6 +632,7 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
raw_spin_unlock_irqrestore(&db->lock, flags);
}
+EXPORT_SYMBOL_GPL(debug_object_assert_init);
/**
* debug_object_active_state - debug checks object usage state machine
@@ -673,6 +680,7 @@ debug_object_active_state(void *addr, struct debug_obj_descr *descr,
raw_spin_unlock_irqrestore(&db->lock, flags);
}
+EXPORT_SYMBOL_GPL(debug_object_active_state);
#ifdef CONFIG_DEBUG_OBJECTS_FREE
static void __debug_check_no_obj_freed(const void *address, unsigned long size)
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index fcfa1939ac41..8971370bfb16 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -22,6 +22,7 @@
#include <linux/stacktrace.h>
#include <linux/dma-debug.h>
#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/export.h>
@@ -43,6 +44,7 @@ enum {
dma_debug_page,
dma_debug_sg,
dma_debug_coherent,
+ dma_debug_resource,
};
enum map_err_types {
@@ -150,8 +152,9 @@ static const char *const maperr2str[] = {
[MAP_ERR_CHECKED] = "dma map error checked",
};
-static const char *type2name[4] = { "single", "page",
- "scather-gather", "coherent" };
+static const char *type2name[5] = { "single", "page",
+ "scather-gather", "coherent",
+ "resource" };
static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
"DMA_FROM_DEVICE", "DMA_NONE" };
@@ -399,6 +402,9 @@ static void hash_bucket_del(struct dma_debug_entry *entry)
static unsigned long long phys_addr(struct dma_debug_entry *entry)
{
+ if (entry->type == dma_debug_resource)
+ return __pfn_to_phys(entry->pfn) + entry->offset;
+
return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset;
}
@@ -1164,11 +1170,32 @@ static void check_unmap(struct dma_debug_entry *ref)
put_hash_bucket(bucket, &flags);
}
-static void check_for_stack(struct device *dev, void *addr)
+static void check_for_stack(struct device *dev,
+ struct page *page, size_t offset)
{
- if (object_is_on_stack(addr))
- err_printk(dev, NULL, "DMA-API: device driver maps memory from "
- "stack [addr=%p]\n", addr);
+ void *addr;
+ struct vm_struct *stack_vm_area = task_stack_vm_area(current);
+
+ if (!stack_vm_area) {
+ /* Stack is direct-mapped. */
+ if (PageHighMem(page))
+ return;
+ addr = page_address(page) + offset;
+ if (object_is_on_stack(addr))
+ err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr);
+ } else {
+ /* Stack is vmalloced. */
+ int i;
+
+ for (i = 0; i < stack_vm_area->nr_pages; i++) {
+ if (page != stack_vm_area->pages[i])
+ continue;
+
+ addr = (u8 *)current->stack + i * PAGE_SIZE + offset;
+ err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr);
+ break;
+ }
+ }
}
static inline bool overlap(void *addr, unsigned long len, void *start, void *end)
@@ -1291,10 +1318,11 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
if (map_single)
entry->type = dma_debug_single;
+ check_for_stack(dev, page, offset);
+
if (!PageHighMem(page)) {
void *addr = page_address(page) + offset;
- check_for_stack(dev, addr);
check_for_illegal_area(dev, addr, size);
}
@@ -1386,8 +1414,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
entry->sg_call_ents = nents;
entry->sg_mapped_ents = mapped_ents;
+ check_for_stack(dev, sg_page(s), s->offset);
+
if (!PageHighMem(sg_page(s))) {
- check_for_stack(dev, sg_virt(s));
check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
}
@@ -1495,6 +1524,49 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
}
EXPORT_SYMBOL(debug_dma_free_coherent);
+void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
+ int direction, dma_addr_t dma_addr)
+{
+ struct dma_debug_entry *entry;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ entry = dma_entry_alloc();
+ if (!entry)
+ return;
+
+ entry->type = dma_debug_resource;
+ entry->dev = dev;
+ entry->pfn = PHYS_PFN(addr);
+ entry->offset = offset_in_page(addr);
+ entry->size = size;
+ entry->dev_addr = dma_addr;
+ entry->direction = direction;
+ entry->map_err_type = MAP_ERR_NOT_CHECKED;
+
+ add_dma_entry(entry);
+}
+EXPORT_SYMBOL(debug_dma_map_resource);
+
+void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
+ size_t size, int direction)
+{
+ struct dma_debug_entry ref = {
+ .type = dma_debug_resource,
+ .dev = dev,
+ .dev_addr = dma_addr,
+ .size = size,
+ .direction = direction,
+ };
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ check_unmap(&ref);
+}
+EXPORT_SYMBOL(debug_dma_unmap_resource);
+
void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
size_t size, int direction)
{
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 0a1139644d32..144fe6b1a03e 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -292,7 +292,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size,
struct gen_pool_chunk *chunk;
unsigned long addr = 0;
int order = pool->min_alloc_order;
- int nbits, start_bit = 0, end_bit, remain;
+ int nbits, start_bit, end_bit, remain;
#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
BUG_ON(in_nmi());
@@ -307,6 +307,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size,
if (size > atomic_read(&chunk->avail))
continue;
+ start_bit = 0;
end_bit = chunk_size(chunk) >> order;
retry:
start_bit = algo(chunk->bits, end_bit, start_bit,
diff --git a/lib/idr.c b/lib/idr.c
index 6098336df267..52d2979a05e8 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -927,6 +927,9 @@ EXPORT_SYMBOL(ida_pre_get);
* and go back to the ida_pre_get() call. If the ida is full, it will
* return %-ENOSPC.
*
+ * Note that callers must ensure that concurrent access to @ida is not possible.
+ * See ida_simple_get() for a varaint which takes care of locking.
+ *
* @p_id returns a value in the range @starting_id ... %0x7fffffff.
*/
int ida_get_new_above(struct ida *ida, int starting_id, int *p_id)
@@ -1073,6 +1076,9 @@ EXPORT_SYMBOL(ida_destroy);
* Allocates an id in the range start <= id < end, or returns -ENOSPC.
* On memory allocation failure, returns -ENOMEM.
*
+ * Compared to ida_get_new_above() this function does its own locking, and
+ * should be used unless there are special requirements.
+ *
* Use ida_simple_remove() to get rid of an id.
*/
int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
@@ -1119,6 +1125,11 @@ EXPORT_SYMBOL(ida_simple_get);
* ida_simple_remove - remove an allocated id.
* @ida: the (initialized) ida.
* @id: the id returned by ida_simple_get.
+ *
+ * Use to release an id allocated with ida_simple_get().
+ *
+ * Compared to ida_remove() this function does its own locking, and should be
+ * used unless there are special requirements.
*/
void ida_simple_remove(struct ida *ida, unsigned int id)
{
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 7e3138cfc8c9..691a52b634fe 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1,10 +1,14 @@
#include <linux/export.h>
+#include <linux/bvec.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/splice.h>
#include <net/checksum.h>
+#define PIPE_PARANOIA /* for now */
+
#define iterate_iovec(i, n, __v, __p, skip, STEP) { \
size_t left; \
size_t wanted = n; \
@@ -290,6 +294,93 @@ done:
return wanted - bytes;
}
+#ifdef PIPE_PARANOIA
+static bool sanity(const struct iov_iter *i)
+{
+ struct pipe_inode_info *pipe = i->pipe;
+ int idx = i->idx;
+ int next = pipe->curbuf + pipe->nrbufs;
+ if (i->iov_offset) {
+ struct pipe_buffer *p;
+ if (unlikely(!pipe->nrbufs))
+ goto Bad; // pipe must be non-empty
+ if (unlikely(idx != ((next - 1) & (pipe->buffers - 1))))
+ goto Bad; // must be at the last buffer...
+
+ p = &pipe->bufs[idx];
+ if (unlikely(p->offset + p->len != i->iov_offset))
+ goto Bad; // ... at the end of segment
+ } else {
+ if (idx != (next & (pipe->buffers - 1)))
+ goto Bad; // must be right after the last buffer
+ }
+ return true;
+Bad:
+ printk(KERN_ERR "idx = %d, offset = %zd\n", i->idx, i->iov_offset);
+ printk(KERN_ERR "curbuf = %d, nrbufs = %d, buffers = %d\n",
+ pipe->curbuf, pipe->nrbufs, pipe->buffers);
+ for (idx = 0; idx < pipe->buffers; idx++)
+ printk(KERN_ERR "[%p %p %d %d]\n",
+ pipe->bufs[idx].ops,
+ pipe->bufs[idx].page,
+ pipe->bufs[idx].offset,
+ pipe->bufs[idx].len);
+ WARN_ON(1);
+ return false;
+}
+#else
+#define sanity(i) true
+#endif
+
+static inline int next_idx(int idx, struct pipe_inode_info *pipe)
+{
+ return (idx + 1) & (pipe->buffers - 1);
+}
+
+static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ struct pipe_inode_info *pipe = i->pipe;
+ struct pipe_buffer *buf;
+ size_t off;
+ int idx;
+
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ if (!sanity(i))
+ return 0;
+
+ off = i->iov_offset;
+ idx = i->idx;
+ buf = &pipe->bufs[idx];
+ if (off) {
+ if (offset == off && buf->page == page) {
+ /* merge with the last one */
+ buf->len += bytes;
+ i->iov_offset += bytes;
+ goto out;
+ }
+ idx = next_idx(idx, pipe);
+ buf = &pipe->bufs[idx];
+ }
+ if (idx == pipe->curbuf && pipe->nrbufs)
+ return 0;
+ pipe->nrbufs++;
+ buf->ops = &page_cache_pipe_buf_ops;
+ get_page(buf->page = page);
+ buf->offset = offset;
+ buf->len = bytes;
+ i->iov_offset = offset + bytes;
+ i->idx = idx;
+out:
+ i->count -= bytes;
+ return bytes;
+}
+
/*
* Fault in one or more iovecs of the given iov_iter, to a maximum length of
* bytes. For each iovec, fault in each page that constitutes the iovec.
@@ -306,8 +397,7 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
if (!(i->type & (ITER_BVEC|ITER_KVEC))) {
iterate_iovec(i, bytes, v, iov, skip, ({
- err = fault_in_multipages_readable(v.iov_base,
- v.iov_len);
+ err = fault_in_pages_readable(v.iov_base, v.iov_len);
if (unlikely(err))
return err;
0;}))
@@ -356,9 +446,98 @@ static void memzero_page(struct page *page, size_t offset, size_t len)
kunmap_atomic(addr);
}
+static inline bool allocated(struct pipe_buffer *buf)
+{
+ return buf->ops == &default_pipe_buf_ops;
+}
+
+static inline void data_start(const struct iov_iter *i, int *idxp, size_t *offp)
+{
+ size_t off = i->iov_offset;
+ int idx = i->idx;
+ if (off && (!allocated(&i->pipe->bufs[idx]) || off == PAGE_SIZE)) {
+ idx = next_idx(idx, i->pipe);
+ off = 0;
+ }
+ *idxp = idx;
+ *offp = off;
+}
+
+static size_t push_pipe(struct iov_iter *i, size_t size,
+ int *idxp, size_t *offp)
+{
+ struct pipe_inode_info *pipe = i->pipe;
+ size_t off;
+ int idx;
+ ssize_t left;
+
+ if (unlikely(size > i->count))
+ size = i->count;
+ if (unlikely(!size))
+ return 0;
+
+ left = size;
+ data_start(i, &idx, &off);
+ *idxp = idx;
+ *offp = off;
+ if (off) {
+ left -= PAGE_SIZE - off;
+ if (left <= 0) {
+ pipe->bufs[idx].len += size;
+ return size;
+ }
+ pipe->bufs[idx].len = PAGE_SIZE;
+ idx = next_idx(idx, pipe);
+ }
+ while (idx != pipe->curbuf || !pipe->nrbufs) {
+ struct page *page = alloc_page(GFP_USER);
+ if (!page)
+ break;
+ pipe->nrbufs++;
+ pipe->bufs[idx].ops = &default_pipe_buf_ops;
+ pipe->bufs[idx].page = page;
+ pipe->bufs[idx].offset = 0;
+ if (left <= PAGE_SIZE) {
+ pipe->bufs[idx].len = left;
+ return size;
+ }
+ pipe->bufs[idx].len = PAGE_SIZE;
+ left -= PAGE_SIZE;
+ idx = next_idx(idx, pipe);
+ }
+ return size - left;
+}
+
+static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
+ struct iov_iter *i)
+{
+ struct pipe_inode_info *pipe = i->pipe;
+ size_t n, off;
+ int idx;
+
+ if (!sanity(i))
+ return 0;
+
+ bytes = n = push_pipe(i, bytes, &idx, &off);
+ if (unlikely(!n))
+ return 0;
+ for ( ; n; idx = next_idx(idx, pipe), off = 0) {
+ size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
+ memcpy_to_page(pipe->bufs[idx].page, off, addr, chunk);
+ i->idx = idx;
+ i->iov_offset = off + chunk;
+ n -= chunk;
+ addr += chunk;
+ }
+ i->count -= bytes;
+ return bytes;
+}
+
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
const char *from = addr;
+ if (unlikely(i->type & ITER_PIPE))
+ return copy_pipe_to_iter(addr, bytes, i);
iterate_and_advance(i, bytes, v,
__copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
v.iov_len),
@@ -374,6 +553,10 @@ EXPORT_SYMBOL(copy_to_iter);
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
char *to = addr;
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return 0;
+ }
iterate_and_advance(i, bytes, v,
__copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
v.iov_len),
@@ -389,6 +572,10 @@ EXPORT_SYMBOL(copy_from_iter);
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
char *to = addr;
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return 0;
+ }
iterate_and_advance(i, bytes, v,
__copy_from_user_nocache((to += v.iov_len) - v.iov_len,
v.iov_base, v.iov_len),
@@ -409,14 +596,20 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
kunmap_atomic(kaddr);
return wanted;
- } else
+ } else if (likely(!(i->type & ITER_PIPE)))
return copy_page_to_iter_iovec(page, offset, bytes, i);
+ else
+ return copy_page_to_iter_pipe(page, offset, bytes, i);
}
EXPORT_SYMBOL(copy_page_to_iter);
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return 0;
+ }
if (i->type & (ITER_BVEC|ITER_KVEC)) {
void *kaddr = kmap_atomic(page);
size_t wanted = copy_from_iter(kaddr + offset, bytes, i);
@@ -427,8 +620,34 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
}
EXPORT_SYMBOL(copy_page_from_iter);
+static size_t pipe_zero(size_t bytes, struct iov_iter *i)
+{
+ struct pipe_inode_info *pipe = i->pipe;
+ size_t n, off;
+ int idx;
+
+ if (!sanity(i))
+ return 0;
+
+ bytes = n = push_pipe(i, bytes, &idx, &off);
+ if (unlikely(!n))
+ return 0;
+
+ for ( ; n; idx = next_idx(idx, pipe), off = 0) {
+ size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
+ memzero_page(pipe->bufs[idx].page, off, chunk);
+ i->idx = idx;
+ i->iov_offset = off + chunk;
+ n -= chunk;
+ }
+ i->count -= bytes;
+ return bytes;
+}
+
size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
+ if (unlikely(i->type & ITER_PIPE))
+ return pipe_zero(bytes, i);
iterate_and_advance(i, bytes, v,
__clear_user(v.iov_base, v.iov_len),
memzero_page(v.bv_page, v.bv_offset, v.bv_len),
@@ -443,6 +662,11 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
struct iov_iter *i, unsigned long offset, size_t bytes)
{
char *kaddr = kmap_atomic(page), *p = kaddr + offset;
+ if (unlikely(i->type & ITER_PIPE)) {
+ kunmap_atomic(kaddr);
+ WARN_ON(1);
+ return 0;
+ }
iterate_all_kinds(i, bytes, v,
__copy_from_user_inatomic((p += v.iov_len) - v.iov_len,
v.iov_base, v.iov_len),
@@ -455,8 +679,51 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
}
EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
+static void pipe_advance(struct iov_iter *i, size_t size)
+{
+ struct pipe_inode_info *pipe = i->pipe;
+ struct pipe_buffer *buf;
+ int idx = i->idx;
+ size_t off = i->iov_offset, orig_sz;
+
+ if (unlikely(i->count < size))
+ size = i->count;
+ orig_sz = size;
+
+ if (size) {
+ if (off) /* make it relative to the beginning of buffer */
+ size += off - pipe->bufs[idx].offset;
+ while (1) {
+ buf = &pipe->bufs[idx];
+ if (size <= buf->len)
+ break;
+ size -= buf->len;
+ idx = next_idx(idx, pipe);
+ }
+ buf->len = size;
+ i->idx = idx;
+ off = i->iov_offset = buf->offset + size;
+ }
+ if (off)
+ idx = next_idx(idx, pipe);
+ if (pipe->nrbufs) {
+ int unused = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+ /* [curbuf,unused) is in use. Free [idx,unused) */
+ while (idx != unused) {
+ pipe_buf_release(pipe, &pipe->bufs[idx]);
+ idx = next_idx(idx, pipe);
+ pipe->nrbufs--;
+ }
+ }
+ i->count -= orig_sz;
+}
+
void iov_iter_advance(struct iov_iter *i, size_t size)
{
+ if (unlikely(i->type & ITER_PIPE)) {
+ pipe_advance(i, size);
+ return;
+ }
iterate_and_advance(i, size, v, 0, 0, 0)
}
EXPORT_SYMBOL(iov_iter_advance);
@@ -466,6 +733,8 @@ EXPORT_SYMBOL(iov_iter_advance);
*/
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
+ if (unlikely(i->type & ITER_PIPE))
+ return i->count; // it is a silly place, anyway
if (i->nr_segs == 1)
return i->count;
else if (i->type & ITER_BVEC)
@@ -501,6 +770,19 @@ void iov_iter_bvec(struct iov_iter *i, int direction,
}
EXPORT_SYMBOL(iov_iter_bvec);
+void iov_iter_pipe(struct iov_iter *i, int direction,
+ struct pipe_inode_info *pipe,
+ size_t count)
+{
+ BUG_ON(direction != ITER_PIPE);
+ i->type = direction;
+ i->pipe = pipe;
+ i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+ i->iov_offset = 0;
+ i->count = count;
+}
+EXPORT_SYMBOL(iov_iter_pipe);
+
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
unsigned long res = 0;
@@ -509,6 +791,11 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
if (!size)
return 0;
+ if (unlikely(i->type & ITER_PIPE)) {
+ if (i->iov_offset && allocated(&i->pipe->bufs[i->idx]))
+ return size | i->iov_offset;
+ return size;
+ }
iterate_all_kinds(i, size, v,
(res |= (unsigned long)v.iov_base | v.iov_len, 0),
res |= v.bv_offset | v.bv_len,
@@ -525,6 +812,11 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
if (!size)
return 0;
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return ~0U;
+ }
+
iterate_all_kinds(i, size, v,
(res |= (!res ? 0 : (unsigned long)v.iov_base) |
(size != v.iov_len ? size : 0), 0),
@@ -537,6 +829,47 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
}
EXPORT_SYMBOL(iov_iter_gap_alignment);
+static inline size_t __pipe_get_pages(struct iov_iter *i,
+ size_t maxsize,
+ struct page **pages,
+ int idx,
+ size_t *start)
+{
+ struct pipe_inode_info *pipe = i->pipe;
+ ssize_t n = push_pipe(i, maxsize, &idx, start);
+ if (!n)
+ return -EFAULT;
+
+ maxsize = n;
+ n += *start;
+ while (n > 0) {
+ get_page(*pages++ = pipe->bufs[idx].page);
+ idx = next_idx(idx, pipe);
+ n -= PAGE_SIZE;
+ }
+
+ return maxsize;
+}
+
+static ssize_t pipe_get_pages(struct iov_iter *i,
+ struct page **pages, size_t maxsize, unsigned maxpages,
+ size_t *start)
+{
+ unsigned npages;
+ size_t capacity;
+ int idx;
+
+ if (!sanity(i))
+ return -EFAULT;
+
+ data_start(i, &idx, start);
+ /* some of this one + all after this one */
+ npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1;
+ capacity = min(npages,maxpages) * PAGE_SIZE - *start;
+
+ return __pipe_get_pages(i, min(maxsize, capacity), pages, idx, start);
+}
+
ssize_t iov_iter_get_pages(struct iov_iter *i,
struct page **pages, size_t maxsize, unsigned maxpages,
size_t *start)
@@ -547,6 +880,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
if (!maxsize)
return 0;
+ if (unlikely(i->type & ITER_PIPE))
+ return pipe_get_pages(i, pages, maxsize, maxpages, start);
iterate_all_kinds(i, maxsize, v, ({
unsigned long addr = (unsigned long)v.iov_base;
size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
@@ -582,6 +917,37 @@ static struct page **get_pages_array(size_t n)
return p;
}
+static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
+ struct page ***pages, size_t maxsize,
+ size_t *start)
+{
+ struct page **p;
+ size_t n;
+ int idx;
+ int npages;
+
+ if (!sanity(i))
+ return -EFAULT;
+
+ data_start(i, &idx, start);
+ /* some of this one + all after this one */
+ npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1;
+ n = npages * PAGE_SIZE - *start;
+ if (maxsize > n)
+ maxsize = n;
+ else
+ npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
+ p = get_pages_array(npages);
+ if (!p)
+ return -ENOMEM;
+ n = __pipe_get_pages(i, maxsize, p, idx, start);
+ if (n > 0)
+ *pages = p;
+ else
+ kvfree(p);
+ return n;
+}
+
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
struct page ***pages, size_t maxsize,
size_t *start)
@@ -594,6 +960,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
if (!maxsize)
return 0;
+ if (unlikely(i->type & ITER_PIPE))
+ return pipe_get_pages_alloc(i, pages, maxsize, start);
iterate_all_kinds(i, maxsize, v, ({
unsigned long addr = (unsigned long)v.iov_base;
size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
@@ -635,6 +1003,10 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
__wsum sum, next;
size_t off = 0;
sum = *csum;
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return 0;
+ }
iterate_and_advance(i, bytes, v, ({
int err = 0;
next = csum_and_copy_from_user(v.iov_base,
@@ -673,6 +1045,10 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum,
__wsum sum, next;
size_t off = 0;
sum = *csum;
+ if (unlikely(i->type & ITER_PIPE)) {
+ WARN_ON(1); /* for now */
+ return 0;
+ }
iterate_and_advance(i, bytes, v, ({
int err = 0;
next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
@@ -712,7 +1088,20 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
if (!size)
return 0;
- iterate_all_kinds(i, size, v, ({
+ if (unlikely(i->type & ITER_PIPE)) {
+ struct pipe_inode_info *pipe = i->pipe;
+ size_t off;
+ int idx;
+
+ if (!sanity(i))
+ return 0;
+
+ data_start(i, &idx, &off);
+ /* some of this one + all after this one */
+ npages = ((pipe->curbuf - idx - 1) & (pipe->buffers - 1)) + 1;
+ if (npages >= maxpages)
+ return maxpages;
+ } else iterate_all_kinds(i, size, v, ({
unsigned long p = (unsigned long)v.iov_base;
npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
- p / PAGE_SIZE;
@@ -737,6 +1126,10 @@ EXPORT_SYMBOL(iov_iter_npages);
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
*new = *old;
+ if (unlikely(new->type & ITER_PIPE)) {
+ WARN_ON(1);
+ return NULL;
+ }
if (new->type & ITER_BVEC)
return new->bvec = kmemdup(new->bvec,
new->nr_segs * sizeof(struct bio_vec),
@@ -749,6 +1142,28 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
}
EXPORT_SYMBOL(dup_iter);
+/**
+ * import_iovec() - Copy an array of &struct iovec from userspace
+ * into the kernel, check that it is valid, and initialize a new
+ * &struct iov_iter iterator to access it.
+ *
+ * @type: One of %READ or %WRITE.
+ * @uvector: Pointer to the userspace array.
+ * @nr_segs: Number of elements in userspace array.
+ * @fast_segs: Number of elements in @iov.
+ * @iov: (input and output parameter) Pointer to pointer to (usually small
+ * on-stack) kernel array.
+ * @i: Pointer to iterator that will be initialized on success.
+ *
+ * If the array pointed to by *@iov is large enough to hold all @nr_segs,
+ * then this function places %NULL in *@iov on return. Otherwise, a new
+ * array will be allocated and the result placed in *@iov. This means that
+ * the caller may call kfree() on *@iov regardless of whether the small
+ * on-stack array was used or not (and regardless of whether this function
+ * returns an error or not).
+ *
+ * Return: 0 on success or negative error code on error.
+ */
int import_iovec(int type, const struct iovec __user * uvector,
unsigned nr_segs, unsigned fast_segs,
struct iovec **iov, struct iov_iter *i)
diff --git a/lib/irq_poll.c b/lib/irq_poll.c
index 836f7db4e548..1d6565e81030 100644
--- a/lib/irq_poll.c
+++ b/lib/irq_poll.c
@@ -74,7 +74,7 @@ void irq_poll_complete(struct irq_poll *iop)
}
EXPORT_SYMBOL(irq_poll_complete);
-static void irq_poll_softirq(struct softirq_action *h)
+static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
{
struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
int rearm = 0, budget = irq_poll_budget;
@@ -184,30 +184,21 @@ void irq_poll_init(struct irq_poll *iop, int weight, irq_poll_fn *poll_fn)
}
EXPORT_SYMBOL(irq_poll_init);
-static int irq_poll_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int irq_poll_cpu_dead(unsigned int cpu)
{
/*
* If a CPU goes away, splice its entries to the current CPU
* and trigger a run of the softirq
*/
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- int cpu = (unsigned long) hcpu;
-
- local_irq_disable();
- list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
- this_cpu_ptr(&blk_cpu_iopoll));
- __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
- local_irq_enable();
- }
+ local_irq_disable();
+ list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
+ this_cpu_ptr(&blk_cpu_iopoll));
+ __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
+ local_irq_enable();
- return NOTIFY_OK;
+ return 0;
}
-static struct notifier_block irq_poll_cpu_notifier = {
- .notifier_call = irq_poll_cpu_notify,
-};
-
static __init int irq_poll_setup(void)
{
int i;
@@ -216,7 +207,8 @@ static __init int irq_poll_setup(void)
INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
open_softirq(IRQ_POLL_SOFTIRQ, irq_poll_softirq);
- register_hotcpu_notifier(&irq_poll_cpu_notifier);
+ cpuhp_setup_state_nocalls(CPUHP_IRQ_POLL_DEAD, "irq_poll:dead", NULL,
+ irq_poll_cpu_dead);
return 0;
}
subsys_initcall(irq_poll_setup);
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index f6c2c1e7779c..9a2b811966eb 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -56,7 +56,7 @@ static const char *kobject_actions[] = {
* kobject_action_type - translate action string to numeric type
*
* @buf: buffer containing the action string, newline is ignored
- * @len: length of buffer
+ * @count: length of buffer
* @type: pointer to the location to store the action type
*
* Returns 0 if the action string was recognized.
@@ -154,8 +154,8 @@ static void cleanup_uevent_env(struct subprocess_info *info)
/**
* kobject_uevent_env - send an uevent with environmental data
*
- * @action: action that is happening
* @kobj: struct kobject that the action is happening to
+ * @action: action that is happening
* @envp_ext: pointer to environmental data
*
* Returns 0 if kobject_uevent_env() is completed with success or the
@@ -363,8 +363,8 @@ EXPORT_SYMBOL_GPL(kobject_uevent_env);
/**
* kobject_uevent - notify userspace by sending an uevent
*
- * @action: action that is happening
* @kobj: struct kobject that the action is happening to
+ * @action: action that is happening
*
* Returns 0 if kobject_uevent() is completed with success or the
* corresponding error when it fails.
diff --git a/lib/kstrtox.c b/lib/kstrtox.c
index d8a5cf66c316..b8e2080c1a47 100644
--- a/lib/kstrtox.c
+++ b/lib/kstrtox.c
@@ -48,11 +48,9 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long
{
unsigned long long res;
unsigned int rv;
- int overflow;
res = 0;
rv = 0;
- overflow = 0;
while (*s) {
unsigned int val;
@@ -71,15 +69,13 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long
*/
if (unlikely(res & (~0ull << 60))) {
if (res > div_u64(ULLONG_MAX - val, base))
- overflow = 1;
+ rv |= KSTRTOX_OVERFLOW;
}
res = res * base + val;
rv++;
s++;
}
*p = res;
- if (overflow)
- rv |= KSTRTOX_OVERFLOW;
return rv;
}
diff --git a/lib/list_debug.c b/lib/list_debug.c
index 3859bf63561c..7f7bfa55eb6d 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -2,8 +2,7 @@
* Copyright 2006, Red Hat, Inc., Dave Jones
* Released under the General Public License (GPL).
*
- * This file contains the linked list implementations for
- * DEBUG_LIST.
+ * This file contains the linked list validation for DEBUG_LIST.
*/
#include <linux/export.h>
@@ -13,88 +12,48 @@
#include <linux/rculist.h>
/*
- * Insert a new entry between two known consecutive entries.
- *
- * This is only for internal list manipulation where we know
- * the prev/next entries already!
+ * Check that the data structures for the list manipulations are reasonably
+ * valid. Failures here indicate memory corruption (and possibly an exploit
+ * attempt).
*/
-void __list_add(struct list_head *new,
- struct list_head *prev,
- struct list_head *next)
+bool __list_add_valid(struct list_head *new, struct list_head *prev,
+ struct list_head *next)
{
- WARN(next->prev != prev,
- "list_add corruption. next->prev should be "
- "prev (%p), but was %p. (next=%p).\n",
+ CHECK_DATA_CORRUPTION(next->prev != prev,
+ "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
prev, next->prev, next);
- WARN(prev->next != next,
- "list_add corruption. prev->next should be "
- "next (%p), but was %p. (prev=%p).\n",
+ CHECK_DATA_CORRUPTION(prev->next != next,
+ "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
next, prev->next, prev);
- WARN(new == prev || new == next,
- "list_add double add: new=%p, prev=%p, next=%p.\n",
- new, prev, next);
- next->prev = new;
- new->next = next;
- new->prev = prev;
- WRITE_ONCE(prev->next, new);
+ CHECK_DATA_CORRUPTION(new == prev || new == next,
+ "list_add double add: new=%p, prev=%p, next=%p.\n",
+ new, prev, next);
+
+ return true;
}
-EXPORT_SYMBOL(__list_add);
+EXPORT_SYMBOL(__list_add_valid);
-void __list_del_entry(struct list_head *entry)
+bool __list_del_entry_valid(struct list_head *entry)
{
struct list_head *prev, *next;
prev = entry->prev;
next = entry->next;
- if (WARN(next == LIST_POISON1,
+ CHECK_DATA_CORRUPTION(next == LIST_POISON1,
"list_del corruption, %p->next is LIST_POISON1 (%p)\n",
- entry, LIST_POISON1) ||
- WARN(prev == LIST_POISON2,
+ entry, LIST_POISON1);
+ CHECK_DATA_CORRUPTION(prev == LIST_POISON2,
"list_del corruption, %p->prev is LIST_POISON2 (%p)\n",
- entry, LIST_POISON2) ||
- WARN(prev->next != entry,
- "list_del corruption. prev->next should be %p, "
- "but was %p\n", entry, prev->next) ||
- WARN(next->prev != entry,
- "list_del corruption. next->prev should be %p, "
- "but was %p\n", entry, next->prev))
- return;
-
- __list_del(prev, next);
-}
-EXPORT_SYMBOL(__list_del_entry);
+ entry, LIST_POISON2);
+ CHECK_DATA_CORRUPTION(prev->next != entry,
+ "list_del corruption. prev->next should be %p, but was %p\n",
+ entry, prev->next);
+ CHECK_DATA_CORRUPTION(next->prev != entry,
+ "list_del corruption. next->prev should be %p, but was %p\n",
+ entry, next->prev);
+ return true;
-/**
- * list_del - deletes entry from list.
- * @entry: the element to delete from the list.
- * Note: list_empty on entry does not return true after this, the entry is
- * in an undefined state.
- */
-void list_del(struct list_head *entry)
-{
- __list_del_entry(entry);
- entry->next = LIST_POISON1;
- entry->prev = LIST_POISON2;
-}
-EXPORT_SYMBOL(list_del);
-
-/*
- * RCU variants.
- */
-void __list_add_rcu(struct list_head *new,
- struct list_head *prev, struct list_head *next)
-{
- WARN(next->prev != prev,
- "list_add_rcu corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
- prev, next->prev, next);
- WARN(prev->next != next,
- "list_add_rcu corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
- next, prev->next, prev);
- new->next = next;
- new->prev = prev;
- rcu_assign_pointer(list_next_rcu(prev), new);
- next->prev = new;
}
-EXPORT_SYMBOL(__list_add_rcu);
+EXPORT_SYMBOL(__list_del_entry_valid);
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 872a15a2a637..f3a217ea0388 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -980,23 +980,23 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
#ifndef CONFIG_PROVE_LOCKING
if (expected == FAILURE && debug_locks) {
expected_testcase_failures++;
- printk("failed|");
+ pr_cont("failed|");
}
else
#endif
if (debug_locks != expected) {
unexpected_testcase_failures++;
- printk("FAILED|");
+ pr_cont("FAILED|");
dump_stack();
} else {
testcase_successes++;
- printk(" ok |");
+ pr_cont(" ok |");
}
testcase_total++;
if (debug_locks_verbose)
- printk(" lockclass mask: %x, debug_locks: %d, expected: %d\n",
+ pr_cont(" lockclass mask: %x, debug_locks: %d, expected: %d\n",
lockclass_mask, debug_locks, expected);
/*
* Some tests (e.g. double-unlock) might corrupt the preemption
@@ -1021,26 +1021,26 @@ static inline void print_testname(const char *testname)
#define DO_TESTCASE_1(desc, name, nr) \
print_testname(desc"/"#nr); \
dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \
- printk("\n");
+ pr_cont("\n");
#define DO_TESTCASE_1B(desc, name, nr) \
print_testname(desc"/"#nr); \
dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \
- printk("\n");
+ pr_cont("\n");
#define DO_TESTCASE_3(desc, name, nr) \
print_testname(desc"/"#nr); \
dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \
dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \
dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \
- printk("\n");
+ pr_cont("\n");
#define DO_TESTCASE_3RW(desc, name, nr) \
print_testname(desc"/"#nr); \
dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\
dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \
dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \
- printk("\n");
+ pr_cont("\n");
#define DO_TESTCASE_6(desc, name) \
print_testname(desc); \
@@ -1050,7 +1050,7 @@ static inline void print_testname(const char *testname)
dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \
dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \
dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \
- printk("\n");
+ pr_cont("\n");
#define DO_TESTCASE_6_SUCCESS(desc, name) \
print_testname(desc); \
@@ -1060,7 +1060,7 @@ static inline void print_testname(const char *testname)
dotest(name##_mutex, SUCCESS, LOCKTYPE_MUTEX); \
dotest(name##_wsem, SUCCESS, LOCKTYPE_RWSEM); \
dotest(name##_rsem, SUCCESS, LOCKTYPE_RWSEM); \
- printk("\n");
+ pr_cont("\n");
/*
* 'read' variant: rlocks must not trigger.
@@ -1073,7 +1073,7 @@ static inline void print_testname(const char *testname)
dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \
dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \
dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \
- printk("\n");
+ pr_cont("\n");
#define DO_TESTCASE_2I(desc, name, nr) \
DO_TESTCASE_1("hard-"desc, name##_hard, nr); \
@@ -1726,25 +1726,25 @@ static void ww_tests(void)
dotest(ww_test_fail_acquire, SUCCESS, LOCKTYPE_WW);
dotest(ww_test_normal, SUCCESS, LOCKTYPE_WW);
dotest(ww_test_unneeded_slow, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("ww contexts mixing");
dotest(ww_test_two_contexts, FAILURE, LOCKTYPE_WW);
dotest(ww_test_diff_class, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("finishing ww context");
dotest(ww_test_context_done_twice, FAILURE, LOCKTYPE_WW);
dotest(ww_test_context_unlock_twice, FAILURE, LOCKTYPE_WW);
dotest(ww_test_context_fini_early, FAILURE, LOCKTYPE_WW);
dotest(ww_test_context_lock_after_done, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("locking mismatches");
dotest(ww_test_object_unlock_twice, FAILURE, LOCKTYPE_WW);
dotest(ww_test_object_lock_unbalanced, FAILURE, LOCKTYPE_WW);
dotest(ww_test_object_lock_stale_context, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("EDEADLK handling");
dotest(ww_test_edeadlk_normal, SUCCESS, LOCKTYPE_WW);
@@ -1757,11 +1757,11 @@ static void ww_tests(void)
dotest(ww_test_edeadlk_acquire_more_edeadlk_slow, FAILURE, LOCKTYPE_WW);
dotest(ww_test_edeadlk_acquire_wrong, FAILURE, LOCKTYPE_WW);
dotest(ww_test_edeadlk_acquire_wrong_slow, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("spinlock nest unlocked");
dotest(ww_test_spin_nest_unlocked, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
printk(" -----------------------------------------------------\n");
printk(" |block | try |context|\n");
@@ -1771,25 +1771,25 @@ static void ww_tests(void)
dotest(ww_test_context_block, FAILURE, LOCKTYPE_WW);
dotest(ww_test_context_try, SUCCESS, LOCKTYPE_WW);
dotest(ww_test_context_context, SUCCESS, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("try");
dotest(ww_test_try_block, FAILURE, LOCKTYPE_WW);
dotest(ww_test_try_try, SUCCESS, LOCKTYPE_WW);
dotest(ww_test_try_context, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("block");
dotest(ww_test_block_block, FAILURE, LOCKTYPE_WW);
dotest(ww_test_block_try, SUCCESS, LOCKTYPE_WW);
dotest(ww_test_block_context, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
print_testname("spinlock");
dotest(ww_test_spin_block, FAILURE, LOCKTYPE_WW);
dotest(ww_test_spin_try, SUCCESS, LOCKTYPE_WW);
dotest(ww_test_spin_context, FAILURE, LOCKTYPE_WW);
- printk("\n");
+ pr_cont("\n");
}
void locking_selftest(void)
@@ -1829,32 +1829,32 @@ void locking_selftest(void)
printk(" --------------------------------------------------------------------------\n");
print_testname("recursive read-lock");
- printk(" |");
+ pr_cont(" |");
dotest(rlock_AA1, SUCCESS, LOCKTYPE_RWLOCK);
- printk(" |");
+ pr_cont(" |");
dotest(rsem_AA1, FAILURE, LOCKTYPE_RWSEM);
- printk("\n");
+ pr_cont("\n");
print_testname("recursive read-lock #2");
- printk(" |");
+ pr_cont(" |");
dotest(rlock_AA1B, SUCCESS, LOCKTYPE_RWLOCK);
- printk(" |");
+ pr_cont(" |");
dotest(rsem_AA1B, FAILURE, LOCKTYPE_RWSEM);
- printk("\n");
+ pr_cont("\n");
print_testname("mixed read-write-lock");
- printk(" |");
+ pr_cont(" |");
dotest(rlock_AA2, FAILURE, LOCKTYPE_RWLOCK);
- printk(" |");
+ pr_cont(" |");
dotest(rsem_AA2, FAILURE, LOCKTYPE_RWSEM);
- printk("\n");
+ pr_cont("\n");
print_testname("mixed write-read-lock");
- printk(" |");
+ pr_cont(" |");
dotest(rlock_AA3, FAILURE, LOCKTYPE_RWLOCK);
- printk(" |");
+ pr_cont(" |");
dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM);
- printk("\n");
+ pr_cont("\n");
printk(" --------------------------------------------------------------------------\n");
diff --git a/lib/lockref.c b/lib/lockref.c
index 5a92189ad711..c4bfcb8836cd 100644
--- a/lib/lockref.c
+++ b/lib/lockref.c
@@ -20,7 +20,7 @@
if (likely(old.lock_count == prev.lock_count)) { \
SUCCESS; \
} \
- cpu_relax_lowlatency(); \
+ cpu_relax(); \
} \
} while (0)
diff --git a/lib/mpi/mpi-pow.c b/lib/mpi/mpi-pow.c
index 5464c8744ea9..e24388a863a7 100644
--- a/lib/mpi/mpi-pow.c
+++ b/lib/mpi/mpi-pow.c
@@ -64,8 +64,13 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod)
if (!esize) {
/* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0
* depending on if MOD equals 1. */
- rp[0] = 1;
res->nlimbs = (msize == 1 && mod->d[0] == 1) ? 0 : 1;
+ if (res->nlimbs) {
+ if (mpi_resize(res, 1) < 0)
+ goto enomem;
+ rp = res->d;
+ rp[0] = 1;
+ }
res->sign = 0;
goto leave;
}
diff --git a/lib/nlattr.c b/lib/nlattr.c
index fce1e9afc6d9..b42b8577fc23 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -14,7 +14,7 @@
#include <linux/types.h>
#include <net/netlink.h>
-static const u16 nla_attr_minlen[NLA_TYPE_MAX+1] = {
+static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
[NLA_U8] = sizeof(u8),
[NLA_U16] = sizeof(u16),
[NLA_U32] = sizeof(u32),
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 26caf51cc238..75554754eadf 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -16,21 +16,23 @@
#include <linux/delay.h>
#include <linux/kprobes.h>
#include <linux/nmi.h>
+#include <linux/cpu.h>
-#ifdef arch_trigger_all_cpu_backtrace
+#ifdef arch_trigger_cpumask_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+/* "in progress" flag of arch_trigger_cpumask_backtrace */
static unsigned long backtrace_flag;
/*
- * When raise() is called it will be is passed a pointer to the
+ * When raise() is called it will be passed a pointer to the
* backtrace_mask. Architectures that call nmi_cpu_backtrace()
* directly from their raise() functions may rely on the mask
* they are passed being updated as a side effect of this call.
*/
-void nmi_trigger_all_cpu_backtrace(bool include_self,
+void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
+ bool exclude_self,
void (*raise)(cpumask_t *mask))
{
int i, this_cpu = get_cpu();
@@ -44,13 +46,22 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
return;
}
- cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
- if (!include_self)
+ cpumask_copy(to_cpumask(backtrace_mask), mask);
+ if (exclude_self)
cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
+ /*
+ * Don't try to send an NMI to this cpu; it may work on some
+ * architectures, but on others it may not, and we'll get
+ * information at least as useful just by doing a dump_stack() here.
+ * Note that nmi_cpu_backtrace(NULL) will clear the cpu bit.
+ */
+ if (cpumask_test_cpu(this_cpu, to_cpumask(backtrace_mask)))
+ nmi_cpu_backtrace(NULL);
+
if (!cpumask_empty(to_cpumask(backtrace_mask))) {
- pr_info("Sending NMI to %s CPUs:\n",
- (include_self ? "all" : "other"));
+ pr_info("Sending NMI from CPU %d to CPUs %*pbl:\n",
+ this_cpu, nr_cpumask_bits, to_cpumask(backtrace_mask));
raise(to_cpumask(backtrace_mask));
}
@@ -77,11 +88,16 @@ bool nmi_cpu_backtrace(struct pt_regs *regs)
int cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- pr_warn("NMI backtrace for cpu %d\n", cpu);
- if (regs)
- show_regs(regs);
- else
- dump_stack();
+ if (regs && cpu_in_idle(instruction_pointer(regs))) {
+ pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n",
+ cpu, instruction_pointer(regs));
+ } else {
+ pr_warn("NMI backtrace for cpu %d\n", cpu);
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+ }
cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
return true;
}
diff --git a/lib/parser.c b/lib/parser.c
index b6d11631231b..3278958b472a 100644
--- a/lib/parser.c
+++ b/lib/parser.c
@@ -152,6 +152,36 @@ static int match_number(substring_t *s, int *result, int base)
}
/**
+ * match_u64int: scan a number in the given base from a substring_t
+ * @s: substring to be scanned
+ * @result: resulting u64 on success
+ * @base: base to use when converting string
+ *
+ * Description: Given a &substring_t and a base, attempts to parse the substring
+ * as a number in that base. On success, sets @result to the integer represented
+ * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ */
+static int match_u64int(substring_t *s, u64 *result, int base)
+{
+ char *buf;
+ int ret;
+ u64 val;
+ size_t len = s->to - s->from;
+
+ buf = kmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ memcpy(buf, s->from, len);
+ buf[len] = '\0';
+
+ ret = kstrtoull(buf, base, &val);
+ if (!ret)
+ *result = val;
+ kfree(buf);
+ return ret;
+}
+
+/**
* match_int: - scan a decimal representation of an integer from a substring_t
* @s: substring_t to be scanned
* @result: resulting integer on success
@@ -167,6 +197,23 @@ int match_int(substring_t *s, int *result)
EXPORT_SYMBOL(match_int);
/**
+ * match_u64: - scan a decimal representation of a u64 from
+ * a substring_t
+ * @s: substring_t to be scanned
+ * @result: resulting unsigned long long on success
+ *
+ * Description: Attempts to parse the &substring_t @s as a long decimal
+ * integer. On success, sets @result to the integer represented by the
+ * string and returns 0.
+ * Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ */
+int match_u64(substring_t *s, u64 *result)
+{
+ return match_u64int(s, result, 0);
+}
+EXPORT_SYMBOL(match_u64);
+
+/**
* match_octal: - scan an octal representation of an integer from a substring_t
* @s: substring_t to be scanned
* @result: resulting integer on success
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 27fe74948882..9ac959ef4cae 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -33,6 +33,7 @@
#define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1))
+static DEFINE_SPINLOCK(percpu_ref_switch_lock);
static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);
static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
@@ -82,6 +83,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
atomic_long_set(&ref->count, start_count);
ref->release = release;
+ ref->confirm_switch = NULL;
return 0;
}
EXPORT_SYMBOL_GPL(percpu_ref_init);
@@ -101,6 +103,8 @@ void percpu_ref_exit(struct percpu_ref *ref)
unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
if (percpu_count) {
+ /* non-NULL confirm_switch indicates switching in progress */
+ WARN_ON_ONCE(ref->confirm_switch);
free_percpu(percpu_count);
ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD;
}
@@ -161,66 +165,23 @@ static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref)
static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch)
{
- if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) {
- /* switching from percpu to atomic */
- ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
-
- /*
- * Non-NULL ->confirm_switch is used to indicate that
- * switching is in progress. Use noop one if unspecified.
- */
- WARN_ON_ONCE(ref->confirm_switch);
- ref->confirm_switch =
- confirm_switch ?: percpu_ref_noop_confirm_switch;
-
- percpu_ref_get(ref); /* put after confirmation */
- call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
- } else if (confirm_switch) {
- /*
- * Somebody already set ATOMIC. Switching may still be in
- * progress. @confirm_switch must be invoked after the
- * switching is complete and a full sched RCU grace period
- * has passed. Wait synchronously for the previous
- * switching and schedule @confirm_switch invocation.
- */
- wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
- ref->confirm_switch = confirm_switch;
-
- percpu_ref_get(ref); /* put after confirmation */
- call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu);
+ if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) {
+ if (confirm_switch)
+ confirm_switch(ref);
+ return;
}
-}
-/**
- * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
- * @ref: percpu_ref to switch to atomic mode
- * @confirm_switch: optional confirmation callback
- *
- * There's no reason to use this function for the usual reference counting.
- * Use percpu_ref_kill[_and_confirm]().
- *
- * Schedule switching of @ref to atomic mode. All its percpu counts will
- * be collected to the main atomic counter. On completion, when all CPUs
- * are guaraneed to be in atomic mode, @confirm_switch, which may not
- * block, is invoked. This function may be invoked concurrently with all
- * the get/put operations and can safely be mixed with kill and reinit
- * operations. Note that @ref will stay in atomic mode across kill/reinit
- * cycles until percpu_ref_switch_to_percpu() is called.
- *
- * This function normally doesn't block and can be called from any context
- * but it may block if @confirm_kill is specified and @ref is already in
- * the process of switching to atomic mode. In such cases, @confirm_switch
- * will be invoked after the switching is complete.
- *
- * Due to the way percpu_ref is implemented, @confirm_switch will be called
- * after at least one full sched RCU grace period has passed but this is an
- * implementation detail and must not be depended upon.
- */
-void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
- percpu_ref_func_t *confirm_switch)
-{
- ref->force_atomic = true;
- __percpu_ref_switch_to_atomic(ref, confirm_switch);
+ /* switching from percpu to atomic */
+ ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
+
+ /*
+ * Non-NULL ->confirm_switch is used to indicate that switching is
+ * in progress. Use noop one if unspecified.
+ */
+ ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch;
+
+ percpu_ref_get(ref); /* put after confirmation */
+ call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
}
static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
@@ -233,8 +194,6 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
return;
- wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
-
atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);
/*
@@ -250,6 +209,58 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
}
+static void __percpu_ref_switch_mode(struct percpu_ref *ref,
+ percpu_ref_func_t *confirm_switch)
+{
+ lockdep_assert_held(&percpu_ref_switch_lock);
+
+ /*
+ * If the previous ATOMIC switching hasn't finished yet, wait for
+ * its completion. If the caller ensures that ATOMIC switching
+ * isn't in progress, this function can be called from any context.
+ */
+ wait_event_lock_irq(percpu_ref_switch_waitq, !ref->confirm_switch,
+ percpu_ref_switch_lock);
+
+ if (ref->force_atomic || (ref->percpu_count_ptr & __PERCPU_REF_DEAD))
+ __percpu_ref_switch_to_atomic(ref, confirm_switch);
+ else
+ __percpu_ref_switch_to_percpu(ref);
+}
+
+/**
+ * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
+ * @ref: percpu_ref to switch to atomic mode
+ * @confirm_switch: optional confirmation callback
+ *
+ * There's no reason to use this function for the usual reference counting.
+ * Use percpu_ref_kill[_and_confirm]().
+ *
+ * Schedule switching of @ref to atomic mode. All its percpu counts will
+ * be collected to the main atomic counter. On completion, when all CPUs
+ * are guaraneed to be in atomic mode, @confirm_switch, which may not
+ * block, is invoked. This function may be invoked concurrently with all
+ * the get/put operations and can safely be mixed with kill and reinit
+ * operations. Note that @ref will stay in atomic mode across kill/reinit
+ * cycles until percpu_ref_switch_to_percpu() is called.
+ *
+ * This function may block if @ref is in the process of switching to atomic
+ * mode. If the caller ensures that @ref is not in the process of
+ * switching to atomic mode, this function can be called from any context.
+ */
+void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
+ percpu_ref_func_t *confirm_switch)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
+ ref->force_atomic = true;
+ __percpu_ref_switch_mode(ref, confirm_switch);
+
+ spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+}
+
/**
* percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
* @ref: percpu_ref to switch to percpu mode
@@ -264,17 +275,20 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
* dying or dead, the actual switching takes place on the following
* percpu_ref_reinit().
*
- * This function normally doesn't block and can be called from any context
- * but it may block if @ref is in the process of switching to atomic mode
- * by percpu_ref_switch_atomic().
+ * This function may block if @ref is in the process of switching to atomic
+ * mode. If the caller ensures that @ref is not in the process of
+ * switching to atomic mode, this function can be called from any context.
*/
void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
ref->force_atomic = false;
+ __percpu_ref_switch_mode(ref, NULL);
- /* a dying or dead ref can't be switched to percpu mode w/o reinit */
- if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD))
- __percpu_ref_switch_to_percpu(ref);
+ spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
/**
@@ -290,21 +304,23 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
*
* This function normally doesn't block and can be called from any context
* but it may block if @confirm_kill is specified and @ref is in the
- * process of switching to atomic mode by percpu_ref_switch_atomic().
- *
- * Due to the way percpu_ref is implemented, @confirm_switch will be called
- * after at least one full sched RCU grace period has passed but this is an
- * implementation detail and must not be depended upon.
+ * process of switching to atomic mode by percpu_ref_switch_to_atomic().
*/
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
percpu_ref_func_t *confirm_kill)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD,
"%s called more than once on %pf!", __func__, ref->release);
ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
- __percpu_ref_switch_to_atomic(ref, confirm_kill);
+ __percpu_ref_switch_mode(ref, confirm_kill);
percpu_ref_put(ref);
+
+ spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
@@ -321,11 +337,16 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
*/
void percpu_ref_reinit(struct percpu_ref *ref)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
WARN_ON_ONCE(!percpu_ref_is_zero(ref));
ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
percpu_ref_get(ref);
- if (!ref->force_atomic)
- __percpu_ref_switch_to_percpu(ref);
+ __percpu_ref_switch_mode(ref, NULL);
+
+ spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_reinit);
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 72d36113ccaa..c8cebb137076 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -158,25 +158,21 @@ EXPORT_SYMBOL(percpu_counter_destroy);
int percpu_counter_batch __read_mostly = 32;
EXPORT_SYMBOL(percpu_counter_batch);
-static void compute_batch_value(void)
+static int compute_batch_value(unsigned int cpu)
{
int nr = num_online_cpus();
percpu_counter_batch = max(32, nr*2);
+ return 0;
}
-static int percpu_counter_hotcpu_callback(struct notifier_block *nb,
- unsigned long action, void *hcpu)
+static int percpu_counter_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
- unsigned int cpu;
struct percpu_counter *fbc;
- compute_batch_value();
- if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
- return NOTIFY_OK;
+ compute_batch_value(cpu);
- cpu = (unsigned long)hcpu;
spin_lock_irq(&percpu_counters_lock);
list_for_each_entry(fbc, &percpu_counters, list) {
s32 *pcount;
@@ -190,7 +186,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb,
}
spin_unlock_irq(&percpu_counters_lock);
#endif
- return NOTIFY_OK;
+ return 0;
}
/*
@@ -222,8 +218,15 @@ EXPORT_SYMBOL(__percpu_counter_compare);
static int __init percpu_counter_startup(void)
{
- compute_batch_value();
- hotcpu_notifier(percpu_counter_hotcpu_callback, 0);
+ int ret;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online",
+ compute_batch_value, NULL);
+ WARN_ON(ret < 0);
+ ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD,
+ "lib/percpu_cnt:dead", NULL,
+ percpu_counter_cpu_dead);
+ WARN_ON(ret < 0);
return 0;
}
module_init(percpu_counter_startup);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 91f0727e3cad..2e8c6f7aa56e 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -220,10 +220,10 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
{
unsigned long i;
- pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n",
+ pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n",
node, node->offset,
node->tags[0][0], node->tags[1][0], node->tags[2][0],
- node->shift, node->count, node->parent);
+ node->shift, node->count, node->exceptional, node->parent);
for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
unsigned long first = index | (i << node->shift);
@@ -325,7 +325,6 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
tag_clear(node, i, 0);
node->slots[0] = NULL;
- node->count = 0;
kmem_cache_free(radix_tree_node_cachep, node);
}
@@ -522,8 +521,13 @@ static int radix_tree_extend(struct radix_tree_root *root,
node->offset = 0;
node->count = 1;
node->parent = NULL;
- if (radix_tree_is_internal_node(slot))
+ if (radix_tree_is_internal_node(slot)) {
entry_to_node(slot)->parent = node;
+ } else {
+ /* Moving an exceptional root->rnode to a node */
+ if (radix_tree_exceptional_entry(slot))
+ node->exceptional = 1;
+ }
node->slots[0] = slot;
slot = node_to_entry(node);
rcu_assign_pointer(root->rnode, slot);
@@ -534,6 +538,104 @@ out:
}
/**
+ * radix_tree_shrink - shrink radix tree to minimum height
+ * @root radix tree root
+ */
+static inline void radix_tree_shrink(struct radix_tree_root *root,
+ radix_tree_update_node_t update_node,
+ void *private)
+{
+ for (;;) {
+ struct radix_tree_node *node = root->rnode;
+ struct radix_tree_node *child;
+
+ if (!radix_tree_is_internal_node(node))
+ break;
+ node = entry_to_node(node);
+
+ /*
+ * The candidate node has more than one child, or its child
+ * is not at the leftmost slot, or the child is a multiorder
+ * entry, we cannot shrink.
+ */
+ if (node->count != 1)
+ break;
+ child = node->slots[0];
+ if (!child)
+ break;
+ if (!radix_tree_is_internal_node(child) && node->shift)
+ break;
+
+ if (radix_tree_is_internal_node(child))
+ entry_to_node(child)->parent = NULL;
+
+ /*
+ * We don't need rcu_assign_pointer(), since we are simply
+ * moving the node from one part of the tree to another: if it
+ * was safe to dereference the old pointer to it
+ * (node->slots[0]), it will be safe to dereference the new
+ * one (root->rnode) as far as dependent read barriers go.
+ */
+ root->rnode = child;
+
+ /*
+ * We have a dilemma here. The node's slot[0] must not be
+ * NULLed in case there are concurrent lookups expecting to
+ * find the item. However if this was a bottom-level node,
+ * then it may be subject to the slot pointer being visible
+ * to callers dereferencing it. If item corresponding to
+ * slot[0] is subsequently deleted, these callers would expect
+ * their slot to become empty sooner or later.
+ *
+ * For example, lockless pagecache will look up a slot, deref
+ * the page pointer, and if the page has 0 refcount it means it
+ * was concurrently deleted from pagecache so try the deref
+ * again. Fortunately there is already a requirement for logic
+ * to retry the entire slot lookup -- the indirect pointer
+ * problem (replacing direct root node with an indirect pointer
+ * also results in a stale slot). So tag the slot as indirect
+ * to force callers to retry.
+ */
+ node->count = 0;
+ if (!radix_tree_is_internal_node(child)) {
+ node->slots[0] = RADIX_TREE_RETRY;
+ if (update_node)
+ update_node(node, private);
+ }
+
+ radix_tree_node_free(node);
+ }
+}
+
+static void delete_node(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ radix_tree_update_node_t update_node, void *private)
+{
+ do {
+ struct radix_tree_node *parent;
+
+ if (node->count) {
+ if (node == entry_to_node(root->rnode))
+ radix_tree_shrink(root, update_node, private);
+ return;
+ }
+
+ parent = node->parent;
+ if (parent) {
+ parent->slots[node->offset] = NULL;
+ parent->count--;
+ } else {
+ root_tag_clear_all(root);
+ root->rnode = NULL;
+ }
+
+ radix_tree_node_free(node);
+
+ node = parent;
+ } while (node);
+}
+
+/**
* __radix_tree_create - create a slot in a radix tree
* @root: radix tree root
* @index: index key
@@ -649,6 +751,8 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
if (node) {
unsigned offset = get_slot_offset(node, slot);
node->count++;
+ if (radix_tree_exceptional_entry(item))
+ node->exceptional++;
BUG_ON(tag_get(node, 0, offset));
BUG_ON(tag_get(node, 1, offset));
BUG_ON(tag_get(node, 2, offset));
@@ -746,6 +850,85 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
}
EXPORT_SYMBOL(radix_tree_lookup);
+static void replace_slot(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ void **slot, void *item,
+ bool warn_typeswitch)
+{
+ void *old = rcu_dereference_raw(*slot);
+ int count, exceptional;
+
+ WARN_ON_ONCE(radix_tree_is_internal_node(item));
+
+ count = !!item - !!old;
+ exceptional = !!radix_tree_exceptional_entry(item) -
+ !!radix_tree_exceptional_entry(old);
+
+ WARN_ON_ONCE(warn_typeswitch && (count || exceptional));
+
+ if (node) {
+ node->count += count;
+ node->exceptional += exceptional;
+ }
+
+ rcu_assign_pointer(*slot, item);
+}
+
+/**
+ * __radix_tree_replace - replace item in a slot
+ * @root: radix tree root
+ * @node: pointer to tree node
+ * @slot: pointer to slot in @node
+ * @item: new item to store in the slot.
+ * @update_node: callback for changing leaf nodes
+ * @private: private data to pass to @update_node
+ *
+ * For use with __radix_tree_lookup(). Caller must hold tree write locked
+ * across slot lookup and replacement.
+ */
+void __radix_tree_replace(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ void **slot, void *item,
+ radix_tree_update_node_t update_node, void *private)
+{
+ /*
+ * This function supports replacing exceptional entries and
+ * deleting entries, but that needs accounting against the
+ * node unless the slot is root->rnode.
+ */
+ replace_slot(root, node, slot, item,
+ !node && slot != (void **)&root->rnode);
+
+ if (!node)
+ return;
+
+ if (update_node)
+ update_node(node, private);
+
+ delete_node(root, node, update_node, private);
+}
+
+/**
+ * radix_tree_replace_slot - replace item in a slot
+ * @root: radix tree root
+ * @slot: pointer to slot
+ * @item: new item to store in the slot.
+ *
+ * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(),
+ * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked
+ * across slot lookup and replacement.
+ *
+ * NOTE: This cannot be used to switch between non-entries (empty slots),
+ * regular entries, and exceptional entries, as that requires accounting
+ * inside the radix tree node. When switching from one type of entry or
+ * deleting, use __radix_tree_lookup() and __radix_tree_replace().
+ */
+void radix_tree_replace_slot(struct radix_tree_root *root,
+ void **slot, void *item)
+{
+ replace_slot(root, NULL, slot, item, true);
+}
+
/**
* radix_tree_tag_set - set a tag on a radix tree node
* @root: radix tree root
@@ -1394,75 +1577,6 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
#endif /* CONFIG_SHMEM && CONFIG_SWAP */
/**
- * radix_tree_shrink - shrink radix tree to minimum height
- * @root radix tree root
- */
-static inline bool radix_tree_shrink(struct radix_tree_root *root)
-{
- bool shrunk = false;
-
- for (;;) {
- struct radix_tree_node *node = root->rnode;
- struct radix_tree_node *child;
-
- if (!radix_tree_is_internal_node(node))
- break;
- node = entry_to_node(node);
-
- /*
- * The candidate node has more than one child, or its child
- * is not at the leftmost slot, or the child is a multiorder
- * entry, we cannot shrink.
- */
- if (node->count != 1)
- break;
- child = node->slots[0];
- if (!child)
- break;
- if (!radix_tree_is_internal_node(child) && node->shift)
- break;
-
- if (radix_tree_is_internal_node(child))
- entry_to_node(child)->parent = NULL;
-
- /*
- * We don't need rcu_assign_pointer(), since we are simply
- * moving the node from one part of the tree to another: if it
- * was safe to dereference the old pointer to it
- * (node->slots[0]), it will be safe to dereference the new
- * one (root->rnode) as far as dependent read barriers go.
- */
- root->rnode = child;
-
- /*
- * We have a dilemma here. The node's slot[0] must not be
- * NULLed in case there are concurrent lookups expecting to
- * find the item. However if this was a bottom-level node,
- * then it may be subject to the slot pointer being visible
- * to callers dereferencing it. If item corresponding to
- * slot[0] is subsequently deleted, these callers would expect
- * their slot to become empty sooner or later.
- *
- * For example, lockless pagecache will look up a slot, deref
- * the page pointer, and if the page has 0 refcount it means it
- * was concurrently deleted from pagecache so try the deref
- * again. Fortunately there is already a requirement for logic
- * to retry the entire slot lookup -- the indirect pointer
- * problem (replacing direct root node with an indirect pointer
- * also results in a stale slot). So tag the slot as indirect
- * to force callers to retry.
- */
- if (!radix_tree_is_internal_node(child))
- node->slots[0] = RADIX_TREE_RETRY;
-
- radix_tree_node_free(node);
- shrunk = true;
- }
-
- return shrunk;
-}
-
-/**
* __radix_tree_delete_node - try to free node after clearing a slot
* @root: radix tree root
* @node: node containing @index
@@ -1470,39 +1584,11 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
* After clearing the slot at @index in @node from radix tree
* rooted at @root, call this function to attempt freeing the
* node and shrinking the tree.
- *
- * Returns %true if @node was freed, %false otherwise.
*/
-bool __radix_tree_delete_node(struct radix_tree_root *root,
+void __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node)
{
- bool deleted = false;
-
- do {
- struct radix_tree_node *parent;
-
- if (node->count) {
- if (node == entry_to_node(root->rnode))
- deleted |= radix_tree_shrink(root);
- return deleted;
- }
-
- parent = node->parent;
- if (parent) {
- parent->slots[node->offset] = NULL;
- parent->count--;
- } else {
- root_tag_clear_all(root);
- root->rnode = NULL;
- }
-
- radix_tree_node_free(node);
- deleted = true;
-
- node = parent;
- } while (node);
-
- return deleted;
+ delete_node(root, node, NULL, NULL);
}
static inline void delete_sibling_entries(struct radix_tree_node *node,
@@ -1559,10 +1645,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
node_tag_clear(root, node, tag, offset);
delete_sibling_entries(node, node_to_entry(slot), offset);
- node->slots[offset] = NULL;
- node->count--;
-
- __radix_tree_delete_node(root, node);
+ __radix_tree_replace(root, node, slot, NULL, NULL, NULL);
return entry;
}
@@ -1583,15 +1666,10 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
}
EXPORT_SYMBOL(radix_tree_delete);
-struct radix_tree_node *radix_tree_replace_clear_tags(
- struct radix_tree_root *root,
- unsigned long index, void *entry)
+void radix_tree_clear_tags(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ void **slot)
{
- struct radix_tree_node *node;
- void **slot;
-
- __radix_tree_lookup(root, index, &node, &slot);
-
if (node) {
unsigned int tag, offset = get_slot_offset(node, slot);
for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
@@ -1600,9 +1678,6 @@ struct radix_tree_node *radix_tree_replace_clear_tags(
/* Clear root node tags */
root->gfp_mask &= __GFP_BITS_MASK;
}
-
- radix_tree_replace_slot(slot, entry);
- return node;
}
/**
@@ -1650,32 +1725,31 @@ static __init void radix_tree_init_maxnodes(void)
}
}
-static int radix_tree_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int radix_tree_cpu_dead(unsigned int cpu)
{
- int cpu = (long)hcpu;
struct radix_tree_preload *rtp;
struct radix_tree_node *node;
/* Free per-cpu pool of preloaded nodes */
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- rtp = &per_cpu(radix_tree_preloads, cpu);
- while (rtp->nr) {
- node = rtp->nodes;
- rtp->nodes = node->private_data;
- kmem_cache_free(radix_tree_node_cachep, node);
- rtp->nr--;
- }
+ rtp = &per_cpu(radix_tree_preloads, cpu);
+ while (rtp->nr) {
+ node = rtp->nodes;
+ rtp->nodes = node->private_data;
+ kmem_cache_free(radix_tree_node_cachep, node);
+ rtp->nr--;
}
- return NOTIFY_OK;
+ return 0;
}
void __init radix_tree_init(void)
{
+ int ret;
radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
sizeof(struct radix_tree_node), 0,
SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
radix_tree_node_ctor);
radix_tree_init_maxnodes();
- hotcpu_notifier(radix_tree_callback, 0);
+ ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",
+ NULL, radix_tree_cpu_dead);
+ WARN_ON(ret < 0);
}
diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore
index 0a7e494b2bcd..f01b1cb04f91 100644
--- a/lib/raid6/.gitignore
+++ b/lib/raid6/.gitignore
@@ -3,3 +3,4 @@ altivec*.c
int*.c
tables.c
neon?.c
+s390vx?.c
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 3b10a48fa040..3057011f5599 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -3,10 +3,11 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o
raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \
int8.o int16.o int32.o
-raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
+raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
+raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
hostprogs-y += mktables
@@ -116,6 +117,11 @@ $(obj)/tilegx8.c: UNROLL := 8
$(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
+targets += s390vx8.c
+$(obj)/s390vx8.c: UNROLL := 8
+$(obj)/s390vx8.c: $(src)/s390vx.uc $(src)/unroll.awk FORCE
+ $(call if_changed,unroll)
+
quiet_cmd_mktable = TABLE $@
cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 975c6e0434bd..7857049fd7d3 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -49,6 +49,10 @@ const struct raid6_calls * const raid6_algos[] = {
&raid6_avx2x1,
&raid6_avx2x2,
#endif
+#ifdef CONFIG_AS_AVX512
+ &raid6_avx512x1,
+ &raid6_avx512x2,
+#endif
#endif
#if defined(__x86_64__) && !defined(__arch_um__)
&raid6_sse2x1,
@@ -59,6 +63,11 @@ const struct raid6_calls * const raid6_algos[] = {
&raid6_avx2x2,
&raid6_avx2x4,
#endif
+#ifdef CONFIG_AS_AVX512
+ &raid6_avx512x1,
+ &raid6_avx512x2,
+ &raid6_avx512x4,
+#endif
#endif
#ifdef CONFIG_ALTIVEC
&raid6_altivec1,
@@ -69,6 +78,9 @@ const struct raid6_calls * const raid6_algos[] = {
#if defined(CONFIG_TILEGX)
&raid6_tilegx8,
#endif
+#if defined(CONFIG_S390)
+ &raid6_s390vx8,
+#endif
&raid6_intx1,
&raid6_intx2,
&raid6_intx4,
@@ -89,12 +101,18 @@ void (*raid6_datap_recov)(int, size_t, int, void **);
EXPORT_SYMBOL_GPL(raid6_datap_recov);
const struct raid6_recov_calls *const raid6_recov_algos[] = {
+#ifdef CONFIG_AS_AVX512
+ &raid6_recov_avx512,
+#endif
#ifdef CONFIG_AS_AVX2
&raid6_recov_avx2,
#endif
#ifdef CONFIG_AS_SSSE3
&raid6_recov_ssse3,
#endif
+#ifdef CONFIG_S390
+ &raid6_recov_s390xc,
+#endif
&raid6_recov_intx1,
NULL
};
diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c
new file mode 100644
index 000000000000..f524a7972006
--- /dev/null
+++ b/lib/raid6/avx512.c
@@ -0,0 +1,569 @@
+/* -*- linux-c -*- --------------------------------------------------------
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Gayatri Kammela <gayatri.kammela@intel.com>
+ * Author: Megha Dey <megha.dey@linux.intel.com>
+ *
+ * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
+ * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
+ * (at your option) any later version; incorporated herein by reference.
+ *
+ * -----------------------------------------------------------------------
+ */
+
+/*
+ * AVX512 implementation of RAID-6 syndrome functions
+ *
+ */
+
+#ifdef CONFIG_AS_AVX512
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static const struct raid6_avx512_constants {
+ u64 x1d[8];
+} raid6_avx512_constants __aligned(512) = {
+ { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
+};
+
+static int raid6_have_avx512(void)
+{
+ return boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ boot_cpu_has(X86_FEATURE_AVX512BW) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ boot_cpu_has(X86_FEATURE_AVX512DQ);
+}
+
+static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa64 %0,%%zmm0\n\t"
+ "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
+ :
+ : "m" (raid6_avx512_constants.x1d[0]));
+
+ for (d = 0; d < bytes; d += 64) {
+ asm volatile("prefetchnta %0\n\t"
+ "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */
+ "prefetchnta %1\n\t"
+ "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
+ "vmovdqa64 %1,%%zmm6"
+ :
+ : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
+ for (z = z0-2; z >= 0; z--) {
+ asm volatile("prefetchnta %0\n\t"
+ "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
+ "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
+ "vmovdqa64 %0,%%zmm6"
+ :
+ : "m" (dptr[z][d]));
+ }
+ asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
+ "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
+ "vmovntdq %%zmm2,%0\n\t"
+ "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
+ "vmovntdq %%zmm4,%1\n\t"
+ "vpxorq %%zmm4,%%zmm4,%%zmm4"
+ :
+ : "m" (p[d]), "m" (q[d]));
+ }
+
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
+static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa64 %0,%%zmm0"
+ : : "m" (raid6_avx512_constants.x1d[0]));
+
+ for (d = 0 ; d < bytes ; d += 64) {
+ asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+ "vmovdqa64 %1,%%zmm2\n\t"
+ "vpxorq %%zmm4,%%zmm2,%%zmm2"
+ :
+ : "m" (dptr[z0][d]), "m" (p[d]));
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+ "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vmovdqa64 %0,%%zmm5\n\t"
+ "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4"
+ :
+ : "m" (dptr[z][d]));
+ }
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) {
+ asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+ "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4"
+ :
+ : );
+ }
+ asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
+ /* Don't use movntdq for r/w memory area < cache line */
+ "vmovdqa64 %%zmm4,%0\n\t"
+ "vmovdqa64 %%zmm2,%1"
+ :
+ : "m" (q[d]), "m" (p[d]));
+ }
+
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x1 = {
+ raid6_avx5121_gen_syndrome,
+ raid6_avx5121_xor_syndrome,
+ raid6_have_avx512,
+ "avx512x1",
+ 1 /* Has cache hints */
+};
+
+/*
+ * Unrolled-by-2 AVX512 implementation
+ */
+static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa64 %0,%%zmm0\n\t"
+ "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
+ :
+ : "m" (raid6_avx512_constants.x1d[0]));
+
+ /* We uniformly assume a single prefetch covers at least 64 bytes */
+ for (d = 0; d < bytes; d += 128) {
+ asm volatile("prefetchnta %0\n\t"
+ "prefetchnta %1\n\t"
+ "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */
+ "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */
+ "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
+ "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */
+ :
+ : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
+ for (z = z0-1; z >= 0; z--) {
+ asm volatile("prefetchnta %0\n\t"
+ "prefetchnta %1\n\t"
+ "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+ "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpmovm2b %%k2,%%zmm7\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vmovdqa64 %0,%%zmm5\n\t"
+ "vmovdqa64 %1,%%zmm7\n\t"
+ "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+ "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm7,%%zmm6,%%zmm6"
+ :
+ : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
+ }
+ asm volatile("vmovntdq %%zmm2,%0\n\t"
+ "vmovntdq %%zmm3,%1\n\t"
+ "vmovntdq %%zmm4,%2\n\t"
+ "vmovntdq %%zmm6,%3"
+ :
+ : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
+ "m" (q[d+64]));
+ }
+
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
+static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa64 %0,%%zmm0"
+ : : "m" (raid6_avx512_constants.x1d[0]));
+
+ for (d = 0 ; d < bytes ; d += 128) {
+ asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+ "vmovdqa64 %1,%%zmm6\n\t"
+ "vmovdqa64 %2,%%zmm2\n\t"
+ "vmovdqa64 %3,%%zmm3\n\t"
+ "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
+ "vpxorq %%zmm6,%%zmm3,%%zmm3"
+ :
+ : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
+ "m" (p[d]), "m" (p[d+64]));
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+ "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+ "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+ "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpmovm2b %%k2,%%zmm7\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vmovdqa64 %0,%%zmm5\n\t"
+ "vmovdqa64 %1,%%zmm7\n\t"
+ "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+ "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm7,%%zmm6,%%zmm6"
+ :
+ : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
+ }
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) {
+ asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+ "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+ "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+ "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpmovm2b %%k2,%%zmm7\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm7,%%zmm6,%%zmm6"
+ :
+ : );
+ }
+ asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
+ "vpxorq %1,%%zmm6,%%zmm6\n\t"
+ /* Don't use movntdq for r/w
+ * memory area < cache line
+ */
+ "vmovdqa64 %%zmm4,%0\n\t"
+ "vmovdqa64 %%zmm6,%1\n\t"
+ "vmovdqa64 %%zmm2,%2\n\t"
+ "vmovdqa64 %%zmm3,%3"
+ :
+ : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
+ "m" (p[d+64]));
+ }
+
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x2 = {
+ raid6_avx5122_gen_syndrome,
+ raid6_avx5122_xor_syndrome,
+ raid6_have_avx512,
+ "avx512x2",
+ 1 /* Has cache hints */
+};
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Unrolled-by-4 AVX2 implementation
+ */
+static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa64 %0,%%zmm0\n\t"
+ "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */
+ "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */
+ "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */
+ "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */
+ "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */
+ "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */
+ "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */
+ "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */
+ "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */
+ :
+ : "m" (raid6_avx512_constants.x1d[0]));
+
+ for (d = 0; d < bytes; d += 256) {
+ for (z = z0; z >= 0; z--) {
+ asm volatile("prefetchnta %0\n\t"
+ "prefetchnta %1\n\t"
+ "prefetchnta %2\n\t"
+ "prefetchnta %3\n\t"
+ "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+ "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
+ "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
+ "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpmovm2b %%k2,%%zmm7\n\t"
+ "vpmovm2b %%k3,%%zmm13\n\t"
+ "vpmovm2b %%k4,%%zmm15\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+ "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+ "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+ "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+ "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
+ "vmovdqa64 %0,%%zmm5\n\t"
+ "vmovdqa64 %1,%%zmm7\n\t"
+ "vmovdqa64 %2,%%zmm13\n\t"
+ "vmovdqa64 %3,%%zmm15\n\t"
+ "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+ "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+ "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
+ "vpxorq %%zmm15,%%zmm11,%%zmm11\n"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+ "vpxorq %%zmm15,%%zmm14,%%zmm14"
+ :
+ : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
+ "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
+ }
+ asm volatile("vmovntdq %%zmm2,%0\n\t"
+ "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
+ "vmovntdq %%zmm3,%1\n\t"
+ "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
+ "vmovntdq %%zmm10,%2\n\t"
+ "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
+ "vmovntdq %%zmm11,%3\n\t"
+ "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
+ "vmovntdq %%zmm4,%4\n\t"
+ "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vmovntdq %%zmm6,%5\n\t"
+ "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vmovntdq %%zmm12,%6\n\t"
+ "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
+ "vmovntdq %%zmm14,%7\n\t"
+ "vpxorq %%zmm14,%%zmm14,%%zmm14"
+ :
+ : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+ "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
+ "m" (q[d+128]), "m" (q[d+192]));
+ }
+
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
+static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa64 %0,%%zmm0"
+ :: "m" (raid6_avx512_constants.x1d[0]));
+
+ for (d = 0 ; d < bytes ; d += 256) {
+ asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+ "vmovdqa64 %1,%%zmm6\n\t"
+ "vmovdqa64 %2,%%zmm12\n\t"
+ "vmovdqa64 %3,%%zmm14\n\t"
+ "vmovdqa64 %4,%%zmm2\n\t"
+ "vmovdqa64 %5,%%zmm3\n\t"
+ "vmovdqa64 %6,%%zmm10\n\t"
+ "vmovdqa64 %7,%%zmm11\n\t"
+ "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
+ "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
+ "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
+ "vpxorq %%zmm14,%%zmm11,%%zmm11"
+ :
+ : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
+ "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
+ "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+ "m" (p[d+192]));
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+ "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+ "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
+ "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
+ "prefetchnta %0\n\t"
+ "prefetchnta %2\n\t"
+ "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+ "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+ "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
+ "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpmovm2b %%k2,%%zmm7\n\t"
+ "vpmovm2b %%k3,%%zmm13\n\t"
+ "vpmovm2b %%k4,%%zmm15\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+ "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+ "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+ "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+ "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
+ "vmovdqa64 %0,%%zmm5\n\t"
+ "vmovdqa64 %1,%%zmm7\n\t"
+ "vmovdqa64 %2,%%zmm13\n\t"
+ "vmovdqa64 %3,%%zmm15\n\t"
+ "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+ "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+ "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
+ "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+ "vpxorq %%zmm15,%%zmm14,%%zmm14"
+ :
+ : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
+ "m" (dptr[z][d+128]),
+ "m" (dptr[z][d+192]));
+ }
+ asm volatile("prefetchnta %0\n\t"
+ "prefetchnta %1\n\t"
+ :
+ : "m" (q[d]), "m" (q[d+128]));
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) {
+ asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+ "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+ "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
+ "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
+ "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+ "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+ "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
+ "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpmovm2b %%k2,%%zmm7\n\t"
+ "vpmovm2b %%k3,%%zmm13\n\t"
+ "vpmovm2b %%k4,%%zmm15\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+ "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+ "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+ "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+ "vpxorq %%zmm15,%%zmm14,%%zmm14"
+ :
+ : );
+ }
+ asm volatile("vmovntdq %%zmm2,%0\n\t"
+ "vmovntdq %%zmm3,%1\n\t"
+ "vmovntdq %%zmm10,%2\n\t"
+ "vmovntdq %%zmm11,%3\n\t"
+ "vpxorq %4,%%zmm4,%%zmm4\n\t"
+ "vpxorq %5,%%zmm6,%%zmm6\n\t"
+ "vpxorq %6,%%zmm12,%%zmm12\n\t"
+ "vpxorq %7,%%zmm14,%%zmm14\n\t"
+ "vmovntdq %%zmm4,%4\n\t"
+ "vmovntdq %%zmm6,%5\n\t"
+ "vmovntdq %%zmm12,%6\n\t"
+ "vmovntdq %%zmm14,%7"
+ :
+ : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+ "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
+ "m" (q[d+128]), "m" (q[d+192]));
+ }
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+const struct raid6_calls raid6_avx512x4 = {
+ raid6_avx5124_gen_syndrome,
+ raid6_avx5124_xor_syndrome,
+ raid6_have_avx512,
+ "avx512x4",
+ 1 /* Has cache hints */
+};
+#endif
+
+#endif /* CONFIG_AS_AVX512 */
diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c
new file mode 100644
index 000000000000..625aafa33b61
--- /dev/null
+++ b/lib/raid6/recov_avx512.c
@@ -0,0 +1,388 @@
+/*
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Gayatri Kammela <gayatri.kammela@intel.com>
+ * Author: Megha Dey <megha.dey@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+
+#ifdef CONFIG_AS_AVX512
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static int raid6_has_avx512(void)
+{
+ return boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ boot_cpu_has(X86_FEATURE_AVX512BW) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ boot_cpu_has(X86_FEATURE_AVX512DQ);
+}
+
+static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+ const u8 x0f = 0x0f;
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /*
+ * Compute syndrome with zero for the missing data pages
+ * Use the dead data pages as temporary storage for
+ * delta p and delta q
+ */
+
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[disks-2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[disks-1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks-2] = p;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+ raid6_gfexp[failb]]];
+
+ kernel_fpu_begin();
+
+ /* zmm0 = x0f[16] */
+ asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));
+
+ while (bytes) {
+#ifdef CONFIG_X86_64
+ asm volatile("vmovdqa64 %0, %%zmm1\n\t"
+ "vmovdqa64 %1, %%zmm9\n\t"
+ "vmovdqa64 %2, %%zmm0\n\t"
+ "vmovdqa64 %3, %%zmm8\n\t"
+ "vpxorq %4, %%zmm1, %%zmm1\n\t"
+ "vpxorq %5, %%zmm9, %%zmm9\n\t"
+ "vpxorq %6, %%zmm0, %%zmm0\n\t"
+ "vpxorq %7, %%zmm8, %%zmm8"
+ :
+ : "m" (q[0]), "m" (q[64]), "m" (p[0]),
+ "m" (p[64]), "m" (dq[0]), "m" (dq[64]),
+ "m" (dp[0]), "m" (dp[64]));
+
+ /*
+ * 1 = dq[0] ^ q[0]
+ * 9 = dq[64] ^ q[64]
+ * 0 = dp[0] ^ p[0]
+ * 8 = dp[64] ^ p[64]
+ */
+
+ asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+ "vbroadcasti64x2 %1, %%zmm5"
+ :
+ : "m" (qmul[0]), "m" (qmul[16]));
+
+ asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
+ "vpsraw $4, %%zmm9, %%zmm12\n\t"
+ "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
+ "vpandq %%zmm7, %%zmm9, %%zmm9\n\t"
+ "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+ "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
+ "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t"
+ "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
+ "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t"
+ "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
+ "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t"
+ "vpxorq %%zmm4, %%zmm5, %%zmm5"
+ :
+ : );
+
+ /*
+ * 5 = qx[0]
+ * 15 = qx[64]
+ */
+
+ asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+ "vbroadcasti64x2 %1, %%zmm1\n\t"
+ "vpsraw $4, %%zmm0, %%zmm2\n\t"
+ "vpsraw $4, %%zmm8, %%zmm6\n\t"
+ "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
+ "vpandq %%zmm7, %%zmm8, %%zmm14\n\t"
+ "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
+ "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+ "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t"
+ "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
+ "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t"
+ "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
+ "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t"
+ "vpxorq %%zmm12, %%zmm13, %%zmm13"
+ :
+ : "m" (pbmul[0]), "m" (pbmul[16]));
+
+ /*
+ * 1 = pbmul[px[0]]
+ * 13 = pbmul[px[64]]
+ */
+ asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
+ "vpxorq %%zmm15, %%zmm13, %%zmm13"
+ :
+ : );
+
+ /*
+ * 1 = db = DQ
+ * 13 = db[64] = DQ[64]
+ */
+ asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+ "vmovdqa64 %%zmm13,%1\n\t"
+ "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
+ "vpxorq %%zmm13, %%zmm8, %%zmm8"
+ :
+ : "m" (dq[0]), "m" (dq[64]));
+
+ asm volatile("vmovdqa64 %%zmm0, %0\n\t"
+ "vmovdqa64 %%zmm8, %1"
+ :
+ : "m" (dp[0]), "m" (dp[64]));
+
+ bytes -= 128;
+ p += 128;
+ q += 128;
+ dp += 128;
+ dq += 128;
+#else
+ asm volatile("vmovdqa64 %0, %%zmm1\n\t"
+ "vmovdqa64 %1, %%zmm0\n\t"
+ "vpxorq %2, %%zmm1, %%zmm1\n\t"
+ "vpxorq %3, %%zmm0, %%zmm0"
+ :
+ : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp));
+
+ /* 1 = dq ^ q; 0 = dp ^ p */
+
+ asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+ "vbroadcasti64x2 %1, %%zmm5"
+ :
+ : "m" (qmul[0]), "m" (qmul[16]));
+
+ /*
+ * 1 = dq ^ q
+ * 3 = dq ^ p >> 4
+ */
+ asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
+ "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
+ "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+ "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
+ "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
+ "vpxorq %%zmm4, %%zmm5, %%zmm5"
+ :
+ : );
+
+ /* 5 = qx */
+
+ asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+ "vbroadcasti64x2 %1, %%zmm1"
+ :
+ : "m" (pbmul[0]), "m" (pbmul[16]));
+
+ asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t"
+ "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
+ "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
+ "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
+ "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
+ "vpxorq %%zmm4, %%zmm1, %%zmm1"
+ :
+ : );
+
+ /* 1 = pbmul[px] */
+ asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
+ /* 1 = db = DQ */
+ "vmovdqa64 %%zmm1, %0\n\t"
+ :
+ : "m" (dq[0]));
+
+ asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
+ "vmovdqa64 %%zmm0, %0"
+ :
+ : "m" (dp[0]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dp += 64;
+ dq += 64;
+#endif
+ }
+
+ kernel_fpu_end();
+}
+
+static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+ const u8 x0f = 0x0f;
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /*
+ * Compute syndrome with zero for the missing data page
+ * Use the dead data page as temporary storage for delta q
+ */
+
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[disks-1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ kernel_fpu_begin();
+
+ asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));
+
+ while (bytes) {
+#ifdef CONFIG_X86_64
+ asm volatile("vmovdqa64 %0, %%zmm3\n\t"
+ "vmovdqa64 %1, %%zmm8\n\t"
+ "vpxorq %2, %%zmm3, %%zmm3\n\t"
+ "vpxorq %3, %%zmm8, %%zmm8"
+ :
+ : "m" (dq[0]), "m" (dq[64]), "m" (q[0]),
+ "m" (q[64]));
+
+ /*
+ * 3 = q[0] ^ dq[0]
+ * 8 = q[64] ^ dq[64]
+ */
+ asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
+ "vmovapd %%zmm0, %%zmm13\n\t"
+ "vbroadcasti64x2 %1, %%zmm1\n\t"
+ "vmovapd %%zmm1, %%zmm14"
+ :
+ : "m" (qmul[0]), "m" (qmul[16]));
+
+ asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
+ "vpsraw $4, %%zmm8, %%zmm12\n\t"
+ "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+ "vpandq %%zmm7, %%zmm8, %%zmm8\n\t"
+ "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+ "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
+ "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
+ "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t"
+ "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
+ "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t"
+ "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t"
+ "vpxorq %%zmm13, %%zmm14, %%zmm14"
+ :
+ : );
+
+ /*
+ * 1 = qmul[q[0] ^ dq[0]]
+ * 14 = qmul[q[64] ^ dq[64]]
+ */
+ asm volatile("vmovdqa64 %0, %%zmm2\n\t"
+ "vmovdqa64 %1, %%zmm12\n\t"
+ "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t"
+ "vpxorq %%zmm14, %%zmm12, %%zmm12"
+ :
+ : "m" (p[0]), "m" (p[64]));
+
+ /*
+ * 2 = p[0] ^ qmul[q[0] ^ dq[0]]
+ * 12 = p[64] ^ qmul[q[64] ^ dq[64]]
+ */
+
+ asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+ "vmovdqa64 %%zmm14, %1\n\t"
+ "vmovdqa64 %%zmm2, %2\n\t"
+ "vmovdqa64 %%zmm12,%3"
+ :
+ : "m" (dq[0]), "m" (dq[64]), "m" (p[0]),
+ "m" (p[64]));
+
+ bytes -= 128;
+ p += 128;
+ q += 128;
+ dq += 128;
+#else
+ asm volatile("vmovdqa64 %0, %%zmm3\n\t"
+ "vpxorq %1, %%zmm3, %%zmm3"
+ :
+ : "m" (dq[0]), "m" (q[0]));
+
+ /* 3 = q ^ dq */
+
+ asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
+ "vbroadcasti64x2 %1, %%zmm1"
+ :
+ : "m" (qmul[0]), "m" (qmul[16]));
+
+ asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
+ "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+ "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+ "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
+ "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
+ "vpxorq %%zmm0, %%zmm1, %%zmm1"
+ :
+ : );
+
+ /* 1 = qmul[q ^ dq] */
+
+ asm volatile("vmovdqa64 %0, %%zmm2\n\t"
+ "vpxorq %%zmm1, %%zmm2, %%zmm2"
+ :
+ : "m" (p[0]));
+
+ /* 2 = p ^ qmul[q ^ dq] */
+
+ asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+ "vmovdqa64 %%zmm2, %1"
+ :
+ : "m" (dq[0]), "m" (p[0]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dq += 64;
+#endif
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_avx512 = {
+ .data2 = raid6_2data_recov_avx512,
+ .datap = raid6_datap_recov_avx512,
+ .valid = raid6_has_avx512,
+#ifdef CONFIG_X86_64
+ .name = "avx512x2",
+#else
+ .name = "avx512x1",
+#endif
+ .priority = 3,
+};
+
+#else
+#warning "your version of binutils lacks AVX512 support"
+#endif
diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c
new file mode 100644
index 000000000000..b042dac826cc
--- /dev/null
+++ b/lib/raid6/recov_s390xc.c
@@ -0,0 +1,116 @@
+/*
+ * RAID-6 data recovery in dual failure mode based on the XC instruction.
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#include <linux/export.h>
+#include <linux/raid/pq.h>
+
+static inline void xor_block(u8 *p1, u8 *p2)
+{
+ typedef struct { u8 _[256]; } addrtype;
+
+ asm volatile(
+ " xc 0(256,%[p1]),0(%[p2])\n"
+ : "+m" (*(addrtype *) p1) : "m" (*(addrtype *) p2),
+ [p1] "a" (p1), [p2] "a" (p2) : "cc");
+}
+
+/* Recover two failed data blocks. */
+static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+ int i;
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /* Compute syndrome with zero for the missing data pages
+ Use the dead data pages as temporary storage for
+ delta p and delta q */
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[disks-2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[disks-1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks-2] = p;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
+ qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
+
+ /* Now do it... */
+ while (bytes) {
+ xor_block(dp, p);
+ xor_block(dq, q);
+ for (i = 0; i < 256; i++)
+ dq[i] = pbmul[dp[i]] ^ qmul[dq[i]];
+ xor_block(dp, dq);
+ p += 256;
+ q += 256;
+ dp += 256;
+ dq += 256;
+ bytes -= 256;
+ }
+}
+
+/* Recover failure of one data block plus the P block */
+static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+ int i;
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /* Compute syndrome with zero for the missing data page
+ Use the dead data page as temporary storage for delta q */
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[disks-1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ /* Now do it... */
+ while (bytes) {
+ xor_block(dq, q);
+ for (i = 0; i < 256; i++)
+ dq[i] = qmul[dq[i]];
+ xor_block(p, dq);
+ p += 256;
+ q += 256;
+ dq += 256;
+ bytes -= 256;
+ }
+}
+
+
+const struct raid6_recov_calls raid6_recov_s390xc = {
+ .data2 = raid6_2data_recov_s390xc,
+ .datap = raid6_datap_recov_s390xc,
+ .valid = NULL,
+ .name = "s390xc",
+ .priority = 1,
+};
diff --git a/lib/raid6/s390vx.uc b/lib/raid6/s390vx.uc
new file mode 100644
index 000000000000..7b45191a655f
--- /dev/null
+++ b/lib/raid6/s390vx.uc
@@ -0,0 +1,168 @@
+/*
+ * raid6_vx$#.c
+ *
+ * $#-way unrolled RAID6 gen/xor functions for s390
+ * based on the vector facility
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ *
+ * This file is postprocessed using unroll.awk.
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/fpu/api.h>
+
+asm(".include \"asm/vx-insn.h\"\n");
+
+#define NSIZE 16
+
+static inline void LOAD_CONST(void)
+{
+ asm volatile("VREPIB %v24,7");
+ asm volatile("VREPIB %v25,0x1d");
+}
+
+/*
+ * The SHLBYTE() operation shifts each of the 16 bytes in
+ * vector register y left by 1 bit and stores the result in
+ * vector register x.
+ */
+static inline void SHLBYTE(int x, int y)
+{
+ asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y));
+}
+
+/*
+ * For each of the 16 bytes in the vector register y the MASK()
+ * operation returns 0xFF if the high bit of the byte is 1,
+ * or 0x00 if the high bit is 0. The result is stored in vector
+ * register x.
+ */
+static inline void MASK(int x, int y)
+{
+ asm volatile ("VESRAVB %0,%1,24" : : "i" (x), "i" (y));
+}
+
+static inline void AND(int x, int y, int z)
+{
+ asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
+}
+
+static inline void XOR(int x, int y, int z)
+{
+ asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
+}
+
+static inline void LOAD_DATA(int x, int n, u8 *ptr)
+{
+ typedef struct { u8 _[16*n]; } addrtype;
+ register addrtype *__ptr asm("1") = (addrtype *) ptr;
+
+ asm volatile ("VLM %2,%3,0,%r1"
+ : : "m" (*__ptr), "a" (__ptr), "i" (x), "i" (x + n - 1));
+}
+
+static inline void STORE_DATA(int x, int n, u8 *ptr)
+{
+ typedef struct { u8 _[16*n]; } addrtype;
+ register addrtype *__ptr asm("1") = (addrtype *) ptr;
+
+ asm volatile ("VSTM %2,%3,0,1"
+ : "=m" (*__ptr) : "a" (__ptr), "i" (x), "i" (x + n - 1));
+}
+
+static inline void COPY_VEC(int x, int y)
+{
+ asm volatile ("VLR %0,%1" : : "i" (x), "i" (y));
+}
+
+static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ struct kernel_fpu vxstate;
+ u8 **dptr, *p, *q;
+ int d, z, z0;
+
+ kernel_fpu_begin(&vxstate, KERNEL_VXR);
+ LOAD_CONST();
+
+ dptr = (u8 **) ptrs;
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0 + 1]; /* XOR parity */
+ q = dptr[z0 + 2]; /* RS syndrome */
+
+ for (d = 0; d < bytes; d += $#*NSIZE) {
+ LOAD_DATA(0,$#,&dptr[z0][d]);
+ COPY_VEC(8+$$,0+$$);
+ for (z = z0 - 1; z >= 0; z--) {
+ MASK(16+$$,8+$$);
+ AND(16+$$,16+$$,25);
+ SHLBYTE(8+$$,8+$$);
+ XOR(8+$$,8+$$,16+$$);
+ LOAD_DATA(16,$#,&dptr[z][d]);
+ XOR(0+$$,0+$$,16+$$);
+ XOR(8+$$,8+$$,16+$$);
+ }
+ STORE_DATA(0,$#,&p[d]);
+ STORE_DATA(8,$#,&q[d]);
+ }
+ kernel_fpu_end(&vxstate, KERNEL_VXR);
+}
+
+static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ struct kernel_fpu vxstate;
+ u8 **dptr, *p, *q;
+ int d, z, z0;
+
+ dptr = (u8 **) ptrs;
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks - 2]; /* XOR parity */
+ q = dptr[disks - 1]; /* RS syndrome */
+
+ kernel_fpu_begin(&vxstate, KERNEL_VXR);
+ LOAD_CONST();
+
+ for (d = 0; d < bytes; d += $#*NSIZE) {
+ /* P/Q data pages */
+ LOAD_DATA(0,$#,&dptr[z0][d]);
+ COPY_VEC(8+$$,0+$$);
+ for (z = z0 - 1; z >= start; z--) {
+ MASK(16+$$,8+$$);
+ AND(16+$$,16+$$,25);
+ SHLBYTE(8+$$,8+$$);
+ XOR(8+$$,8+$$,16+$$);
+ LOAD_DATA(16,$#,&dptr[z][d]);
+ XOR(0+$$,0+$$,16+$$);
+ XOR(8+$$,8+$$,16+$$);
+ }
+ /* P/Q left side optimization */
+ for (z = start - 1; z >= 0; z--) {
+ MASK(16+$$,8+$$);
+ AND(16+$$,16+$$,25);
+ SHLBYTE(8+$$,8+$$);
+ XOR(8+$$,8+$$,16+$$);
+ }
+ LOAD_DATA(16,$#,&p[d]);
+ XOR(16+$$,16+$$,0+$$);
+ STORE_DATA(16,$#,&p[d]);
+ LOAD_DATA(16,$#,&q[d]);
+ XOR(16+$$,16+$$,8+$$);
+ STORE_DATA(16,$#,&q[d]);
+ }
+ kernel_fpu_end(&vxstate, KERNEL_VXR);
+}
+
+static int raid6_s390vx$#_valid(void)
+{
+ return MACHINE_HAS_VX;
+}
+
+const struct raid6_calls raid6_s390vx$# = {
+ raid6_s390vx$#_gen_syndrome,
+ raid6_s390vx$#_xor_syndrome,
+ raid6_s390vx$#_valid,
+ "vx128x$#",
+ 1
+};
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 29090f3db677..2c7b60edea04 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -32,10 +32,13 @@ ifeq ($(ARCH),arm64)
endif
ifeq ($(IS_X86),yes)
- OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o
+ OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \
gcc -c -x assembler - >&/dev/null && \
rm ./-.o && echo -DCONFIG_AS_AVX2=1)
+ CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \
+ gcc -c -x assembler - >&/dev/null && \
+ rm ./-.o && echo -DCONFIG_AS_AVX512=1)
else ifeq ($(HAS_NEON),yes)
OBJS += neon.o neon1.o neon2.o neon4.o neon8.o
CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c
index 3bebbabdb510..b07f4d8e6b03 100644
--- a/lib/raid6/test/test.c
+++ b/lib/raid6/test/test.c
@@ -21,12 +21,13 @@
#define NDISKS 16 /* Including P and Q */
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
+const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
struct raid6_calls raid6_call;
char *dataptrs[NDISKS];
-char data[NDISKS][PAGE_SIZE];
-char recovi[PAGE_SIZE], recovj[PAGE_SIZE];
+char data[NDISKS][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
+char recovi[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
+char recovj[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
static void makedata(int start, int stop)
{
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index 8fe9d9662abb..834d268a4b05 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -46,6 +46,16 @@ static inline void kernel_fpu_end(void)
#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */
#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ (9*32+17) /* AVX-512 DQ (Double/Quad granular)
+ * Instructions
+ */
+#define X86_FEATURE_AVX512BW (9*32+30) /* AVX-512 BW (Byte/Word granular)
+ * Instructions
+ */
+#define X86_FEATURE_AVX512VL (9*32+31) /* AVX-512 VL (128/256 Vector Length)
+ * Extensions
+ */
#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
/* Should work well enough on modern CPUs for testing */
diff --git a/lib/random32.c b/lib/random32.c
index 69ed593aab07..fa594b1140e6 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -47,7 +47,7 @@ static inline void prandom_state_selftest(void)
}
#endif
-static DEFINE_PER_CPU(struct rnd_state, net_rand_state);
+static DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy;
/**
* prandom_u32_state - seeded pseudo-random number generator.
@@ -81,7 +81,7 @@ u32 prandom_u32(void)
u32 res;
res = prandom_u32_state(state);
- put_cpu_var(state);
+ put_cpu_var(net_rand_state);
return res;
}
@@ -128,7 +128,7 @@ void prandom_bytes(void *buf, size_t bytes)
struct rnd_state *state = &get_cpu_var(net_rand_state);
prandom_bytes_state(state, buf, bytes);
- put_cpu_var(state);
+ put_cpu_var(net_rand_state);
}
EXPORT_SYMBOL(prandom_bytes);
diff --git a/lib/rbtree.c b/lib/rbtree.c
index eb8a19fee110..1f8b112a7c35 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -296,11 +296,26 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
*
* (p) (p)
* / \ / \
- * N S --> N Sl
+ * N S --> N sl
* / \ \
- * sl Sr s
+ * sl Sr S
* \
* Sr
+ *
+ * Note: p might be red, and then both
+ * p and sl are red after rotation(which
+ * breaks property 4). This is fixed in
+ * Case 4 (in __rb_rotate_set_parents()
+ * which set sl the color of p
+ * and set p RB_BLACK)
+ *
+ * (p) (sl)
+ * / \ / \
+ * N sl --> P S
+ * \ / \
+ * S N Sr
+ * \
+ * Sr
*/
tmp1 = tmp2->rb_right;
WRITE_ONCE(sibling->rb_left, tmp1);
@@ -365,7 +380,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
}
break;
}
- /* Case 3 - right rotate at sibling */
+ /* Case 3 - left rotate at sibling */
tmp1 = tmp2->rb_left;
WRITE_ONCE(sibling->rb_right, tmp1);
WRITE_ONCE(tmp2->rb_left, sibling);
@@ -377,7 +392,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
tmp1 = sibling;
sibling = tmp2;
}
- /* Case 4 - left rotate at parent + color flips */
+ /* Case 4 - right rotate at parent + color flips */
tmp2 = sibling->rb_right;
WRITE_ONCE(parent->rb_left, tmp2);
WRITE_ONCE(sibling->rb_right, parent);
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 56054e541a0f..32d0ad058380 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -378,22 +378,8 @@ static void rht_deferred_worker(struct work_struct *work)
schedule_work(&ht->run_work);
}
-static bool rhashtable_check_elasticity(struct rhashtable *ht,
- struct bucket_table *tbl,
- unsigned int hash)
-{
- unsigned int elasticity = ht->elasticity;
- struct rhash_head *head;
-
- rht_for_each(head, tbl, hash)
- if (!--elasticity)
- return true;
-
- return false;
-}
-
-int rhashtable_insert_rehash(struct rhashtable *ht,
- struct bucket_table *tbl)
+static int rhashtable_insert_rehash(struct rhashtable *ht,
+ struct bucket_table *tbl)
{
struct bucket_table *old_tbl;
struct bucket_table *new_tbl;
@@ -439,61 +425,172 @@ fail:
return err;
}
-EXPORT_SYMBOL_GPL(rhashtable_insert_rehash);
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
- const void *key,
- struct rhash_head *obj,
- struct bucket_table *tbl)
+static void *rhashtable_lookup_one(struct rhashtable *ht,
+ struct bucket_table *tbl, unsigned int hash,
+ const void *key, struct rhash_head *obj)
{
+ struct rhashtable_compare_arg arg = {
+ .ht = ht,
+ .key = key,
+ };
+ struct rhash_head __rcu **pprev;
struct rhash_head *head;
- unsigned int hash;
- int err;
+ int elasticity;
- tbl = rhashtable_last_table(ht, tbl);
- hash = head_hashfn(ht, tbl, obj);
- spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
+ elasticity = ht->elasticity;
+ pprev = &tbl->buckets[hash];
+ rht_for_each(head, tbl, hash) {
+ struct rhlist_head *list;
+ struct rhlist_head *plist;
- err = -EEXIST;
- if (key && rhashtable_lookup_fast(ht, key, ht->p))
- goto exit;
+ elasticity--;
+ if (!key ||
+ (ht->p.obj_cmpfn ?
+ ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
+ rhashtable_compare(&arg, rht_obj(ht, head))))
+ continue;
- err = -E2BIG;
- if (unlikely(rht_grow_above_max(ht, tbl)))
- goto exit;
+ if (!ht->rhlist)
+ return rht_obj(ht, head);
+
+ list = container_of(obj, struct rhlist_head, rhead);
+ plist = container_of(head, struct rhlist_head, rhead);
+
+ RCU_INIT_POINTER(list->next, plist);
+ head = rht_dereference_bucket(head->next, tbl, hash);
+ RCU_INIT_POINTER(list->rhead.next, head);
+ rcu_assign_pointer(*pprev, obj);
+
+ return NULL;
+ }
+
+ if (elasticity <= 0)
+ return ERR_PTR(-EAGAIN);
+
+ return ERR_PTR(-ENOENT);
+}
+
+static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
+ struct bucket_table *tbl,
+ unsigned int hash,
+ struct rhash_head *obj,
+ void *data)
+{
+ struct bucket_table *new_tbl;
+ struct rhash_head *head;
+
+ if (!IS_ERR_OR_NULL(data))
+ return ERR_PTR(-EEXIST);
+
+ if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
+ return ERR_CAST(data);
- err = -EAGAIN;
- if (rhashtable_check_elasticity(ht, tbl, hash) ||
- rht_grow_above_100(ht, tbl))
- goto exit;
+ new_tbl = rcu_dereference(tbl->future_tbl);
+ if (new_tbl)
+ return new_tbl;
- err = 0;
+ if (PTR_ERR(data) != -ENOENT)
+ return ERR_CAST(data);
+
+ if (unlikely(rht_grow_above_max(ht, tbl)))
+ return ERR_PTR(-E2BIG);
+
+ if (unlikely(rht_grow_above_100(ht, tbl)))
+ return ERR_PTR(-EAGAIN);
head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
RCU_INIT_POINTER(obj->next, head);
+ if (ht->rhlist) {
+ struct rhlist_head *list;
+
+ list = container_of(obj, struct rhlist_head, rhead);
+ RCU_INIT_POINTER(list->next, NULL);
+ }
rcu_assign_pointer(tbl->buckets[hash], obj);
atomic_inc(&ht->nelems);
+ if (rht_grow_above_75(ht, tbl))
+ schedule_work(&ht->run_work);
-exit:
- spin_unlock(rht_bucket_lock(tbl, hash));
+ return NULL;
+}
- if (err == 0)
- return NULL;
- else if (err == -EAGAIN)
- return tbl;
- else
- return ERR_PTR(err);
+static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj)
+{
+ struct bucket_table *new_tbl;
+ struct bucket_table *tbl;
+ unsigned int hash;
+ spinlock_t *lock;
+ void *data;
+
+ tbl = rcu_dereference(ht->tbl);
+
+ /* All insertions must grab the oldest table containing
+ * the hashed bucket that is yet to be rehashed.
+ */
+ for (;;) {
+ hash = rht_head_hashfn(ht, tbl, obj, ht->p);
+ lock = rht_bucket_lock(tbl, hash);
+ spin_lock_bh(lock);
+
+ if (tbl->rehash <= hash)
+ break;
+
+ spin_unlock_bh(lock);
+ tbl = rcu_dereference(tbl->future_tbl);
+ }
+
+ data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
+ new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
+ if (PTR_ERR(new_tbl) != -EEXIST)
+ data = ERR_CAST(new_tbl);
+
+ while (!IS_ERR_OR_NULL(new_tbl)) {
+ tbl = new_tbl;
+ hash = rht_head_hashfn(ht, tbl, obj, ht->p);
+ spin_lock_nested(rht_bucket_lock(tbl, hash),
+ SINGLE_DEPTH_NESTING);
+
+ data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
+ new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
+ if (PTR_ERR(new_tbl) != -EEXIST)
+ data = ERR_CAST(new_tbl);
+
+ spin_unlock(rht_bucket_lock(tbl, hash));
+ }
+
+ spin_unlock_bh(lock);
+
+ if (PTR_ERR(data) == -EAGAIN)
+ data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
+ -EAGAIN);
+
+ return data;
+}
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj)
+{
+ void *data;
+
+ do {
+ rcu_read_lock();
+ data = rhashtable_try_insert(ht, key, obj);
+ rcu_read_unlock();
+ } while (PTR_ERR(data) == -EAGAIN);
+
+ return data;
}
EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
/**
- * rhashtable_walk_init - Initialise an iterator
+ * rhashtable_walk_enter - Initialise an iterator
* @ht: Table to walk over
* @iter: Hash table Iterator
- * @gfp: GFP flags for allocations
*
* This function prepares a hash table walk.
*
@@ -508,30 +605,22 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
* This function may sleep so you must not call it from interrupt
* context or with spin locks held.
*
- * You must call rhashtable_walk_exit if this function returns
- * successfully.
+ * You must call rhashtable_walk_exit after this function returns.
*/
-int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter,
- gfp_t gfp)
+void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
{
iter->ht = ht;
iter->p = NULL;
iter->slot = 0;
iter->skip = 0;
- iter->walker = kmalloc(sizeof(*iter->walker), gfp);
- if (!iter->walker)
- return -ENOMEM;
-
spin_lock(&ht->lock);
- iter->walker->tbl =
+ iter->walker.tbl =
rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
- list_add(&iter->walker->list, &iter->walker->tbl->walkers);
+ list_add(&iter->walker.list, &iter->walker.tbl->walkers);
spin_unlock(&ht->lock);
-
- return 0;
}
-EXPORT_SYMBOL_GPL(rhashtable_walk_init);
+EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
/**
* rhashtable_walk_exit - Free an iterator
@@ -542,10 +631,9 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init);
void rhashtable_walk_exit(struct rhashtable_iter *iter)
{
spin_lock(&iter->ht->lock);
- if (iter->walker->tbl)
- list_del(&iter->walker->list);
+ if (iter->walker.tbl)
+ list_del(&iter->walker.list);
spin_unlock(&iter->ht->lock);
- kfree(iter->walker);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
@@ -571,12 +659,12 @@ int rhashtable_walk_start(struct rhashtable_iter *iter)
rcu_read_lock();
spin_lock(&ht->lock);
- if (iter->walker->tbl)
- list_del(&iter->walker->list);
+ if (iter->walker.tbl)
+ list_del(&iter->walker.list);
spin_unlock(&ht->lock);
- if (!iter->walker->tbl) {
- iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht);
+ if (!iter->walker.tbl) {
+ iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
return -EAGAIN;
}
@@ -598,12 +686,17 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start);
*/
void *rhashtable_walk_next(struct rhashtable_iter *iter)
{
- struct bucket_table *tbl = iter->walker->tbl;
+ struct bucket_table *tbl = iter->walker.tbl;
+ struct rhlist_head *list = iter->list;
struct rhashtable *ht = iter->ht;
struct rhash_head *p = iter->p;
+ bool rhlist = ht->rhlist;
if (p) {
- p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot);
+ if (!rhlist || !(list = rcu_dereference(list->next))) {
+ p = rcu_dereference(p->next);
+ list = container_of(p, struct rhlist_head, rhead);
+ }
goto next;
}
@@ -611,6 +704,18 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
int skip = iter->skip;
rht_for_each_rcu(p, tbl, iter->slot) {
+ if (rhlist) {
+ list = container_of(p, struct rhlist_head,
+ rhead);
+ do {
+ if (!skip)
+ goto next;
+ skip--;
+ list = rcu_dereference(list->next);
+ } while (list);
+
+ continue;
+ }
if (!skip)
break;
skip--;
@@ -620,7 +725,8 @@ next:
if (!rht_is_a_nulls(p)) {
iter->skip++;
iter->p = p;
- return rht_obj(ht, p);
+ iter->list = list;
+ return rht_obj(ht, rhlist ? &list->rhead : p);
}
iter->skip = 0;
@@ -631,8 +737,8 @@ next:
/* Ensure we see any new tables. */
smp_rmb();
- iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht);
- if (iter->walker->tbl) {
+ iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (iter->walker.tbl) {
iter->slot = 0;
iter->skip = 0;
return ERR_PTR(-EAGAIN);
@@ -652,7 +758,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
__releases(RCU)
{
struct rhashtable *ht;
- struct bucket_table *tbl = iter->walker->tbl;
+ struct bucket_table *tbl = iter->walker.tbl;
if (!tbl)
goto out;
@@ -661,9 +767,9 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
spin_lock(&ht->lock);
if (tbl->rehash < tbl->size)
- list_add(&iter->walker->list, &tbl->walkers);
+ list_add(&iter->walker.list, &tbl->walkers);
else
- iter->walker->tbl = NULL;
+ iter->walker.tbl = NULL;
spin_unlock(&ht->lock);
iter->p = NULL;
@@ -809,6 +915,48 @@ int rhashtable_init(struct rhashtable *ht,
EXPORT_SYMBOL_GPL(rhashtable_init);
/**
+ * rhltable_init - initialize a new hash list table
+ * @hlt: hash list table to be initialized
+ * @params: configuration parameters
+ *
+ * Initializes a new hash list table.
+ *
+ * See documentation for rhashtable_init.
+ */
+int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
+{
+ int err;
+
+ /* No rhlist NULLs marking for now. */
+ if (params->nulls_base)
+ return -EINVAL;
+
+ err = rhashtable_init(&hlt->ht, params);
+ hlt->ht.rhlist = true;
+ return err;
+}
+EXPORT_SYMBOL_GPL(rhltable_init);
+
+static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg)
+{
+ struct rhlist_head *list;
+
+ if (!ht->rhlist) {
+ free_fn(rht_obj(ht, obj), arg);
+ return;
+ }
+
+ list = container_of(obj, struct rhlist_head, rhead);
+ do {
+ obj = &list->rhead;
+ list = rht_dereference(list->next, ht);
+ free_fn(rht_obj(ht, obj), arg);
+ } while (list);
+}
+
+/**
* rhashtable_free_and_destroy - free elements and destroy hash table
* @ht: the hash table to destroy
* @free_fn: callback to release resources of element
@@ -845,7 +993,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
pos = next,
next = !rht_is_a_nulls(pos) ?
rht_dereference(pos->next, ht) : NULL)
- free_fn(rht_obj(ht, pos), arg);
+ rhashtable_free_one(ht, pos, free_fn, arg);
}
}
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
new file mode 100644
index 000000000000..2cecf05c82fd
--- /dev/null
+++ b/lib/sbitmap.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (C) 2016 Facebook
+ * Copyright (C) 2013-2014 Jens Axboe
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <linux/random.h>
+#include <linux/sbitmap.h>
+
+int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
+ gfp_t flags, int node)
+{
+ unsigned int bits_per_word;
+ unsigned int i;
+
+ if (shift < 0) {
+ shift = ilog2(BITS_PER_LONG);
+ /*
+ * If the bitmap is small, shrink the number of bits per word so
+ * we spread over a few cachelines, at least. If less than 4
+ * bits, just forget about it, it's not going to work optimally
+ * anyway.
+ */
+ if (depth >= 4) {
+ while ((4U << shift) > depth)
+ shift--;
+ }
+ }
+ bits_per_word = 1U << shift;
+ if (bits_per_word > BITS_PER_LONG)
+ return -EINVAL;
+
+ sb->shift = shift;
+ sb->depth = depth;
+ sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
+
+ if (depth == 0) {
+ sb->map = NULL;
+ return 0;
+ }
+
+ sb->map = kzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node);
+ if (!sb->map)
+ return -ENOMEM;
+
+ for (i = 0; i < sb->map_nr; i++) {
+ sb->map[i].depth = min(depth, bits_per_word);
+ depth -= sb->map[i].depth;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sbitmap_init_node);
+
+void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
+{
+ unsigned int bits_per_word = 1U << sb->shift;
+ unsigned int i;
+
+ sb->depth = depth;
+ sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
+
+ for (i = 0; i < sb->map_nr; i++) {
+ sb->map[i].depth = min(depth, bits_per_word);
+ depth -= sb->map[i].depth;
+ }
+}
+EXPORT_SYMBOL_GPL(sbitmap_resize);
+
+static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
+ bool wrap)
+{
+ unsigned int orig_hint = hint;
+ int nr;
+
+ while (1) {
+ nr = find_next_zero_bit(&word->word, word->depth, hint);
+ if (unlikely(nr >= word->depth)) {
+ /*
+ * We started with an offset, and we didn't reset the
+ * offset to 0 in a failure case, so start from 0 to
+ * exhaust the map.
+ */
+ if (orig_hint && hint && wrap) {
+ hint = orig_hint = 0;
+ continue;
+ }
+ return -1;
+ }
+
+ if (!test_and_set_bit(nr, &word->word))
+ break;
+
+ hint = nr + 1;
+ if (hint >= word->depth - 1)
+ hint = 0;
+ }
+
+ return nr;
+}
+
+int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
+{
+ unsigned int i, index;
+ int nr = -1;
+
+ index = SB_NR_TO_INDEX(sb, alloc_hint);
+
+ for (i = 0; i < sb->map_nr; i++) {
+ nr = __sbitmap_get_word(&sb->map[index],
+ SB_NR_TO_BIT(sb, alloc_hint),
+ !round_robin);
+ if (nr != -1) {
+ nr += index << sb->shift;
+ break;
+ }
+
+ /* Jump to next index. */
+ index++;
+ alloc_hint = index << sb->shift;
+
+ if (index >= sb->map_nr) {
+ index = 0;
+ alloc_hint = 0;
+ }
+ }
+
+ return nr;
+}
+EXPORT_SYMBOL_GPL(sbitmap_get);
+
+bool sbitmap_any_bit_set(const struct sbitmap *sb)
+{
+ unsigned int i;
+
+ for (i = 0; i < sb->map_nr; i++) {
+ if (sb->map[i].word)
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(sbitmap_any_bit_set);
+
+bool sbitmap_any_bit_clear(const struct sbitmap *sb)
+{
+ unsigned int i;
+
+ for (i = 0; i < sb->map_nr; i++) {
+ const struct sbitmap_word *word = &sb->map[i];
+ unsigned long ret;
+
+ ret = find_first_zero_bit(&word->word, word->depth);
+ if (ret < word->depth)
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear);
+
+unsigned int sbitmap_weight(const struct sbitmap *sb)
+{
+ unsigned int i, weight = 0;
+
+ for (i = 0; i < sb->map_nr; i++) {
+ const struct sbitmap_word *word = &sb->map[i];
+
+ weight += bitmap_weight(&word->word, word->depth);
+ }
+ return weight;
+}
+EXPORT_SYMBOL_GPL(sbitmap_weight);
+
+static unsigned int sbq_calc_wake_batch(unsigned int depth)
+{
+ unsigned int wake_batch;
+
+ /*
+ * For each batch, we wake up one queue. We need to make sure that our
+ * batch size is small enough that the full depth of the bitmap is
+ * enough to wake up all of the queues.
+ */
+ wake_batch = SBQ_WAKE_BATCH;
+ if (wake_batch > depth / SBQ_WAIT_QUEUES)
+ wake_batch = max(1U, depth / SBQ_WAIT_QUEUES);
+
+ return wake_batch;
+}
+
+int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
+ int shift, bool round_robin, gfp_t flags, int node)
+{
+ int ret;
+ int i;
+
+ ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node);
+ if (ret)
+ return ret;
+
+ sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags);
+ if (!sbq->alloc_hint) {
+ sbitmap_free(&sbq->sb);
+ return -ENOMEM;
+ }
+
+ if (depth && !round_robin) {
+ for_each_possible_cpu(i)
+ *per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth;
+ }
+
+ sbq->wake_batch = sbq_calc_wake_batch(depth);
+ atomic_set(&sbq->wake_index, 0);
+
+ sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
+ if (!sbq->ws) {
+ free_percpu(sbq->alloc_hint);
+ sbitmap_free(&sbq->sb);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+ init_waitqueue_head(&sbq->ws[i].wait);
+ atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch);
+ }
+
+ sbq->round_robin = round_robin;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
+
+void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
+{
+ sbq->wake_batch = sbq_calc_wake_batch(depth);
+ sbitmap_resize(&sbq->sb, depth);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
+
+int __sbitmap_queue_get(struct sbitmap_queue *sbq)
+{
+ unsigned int hint, depth;
+ int nr;
+
+ hint = this_cpu_read(*sbq->alloc_hint);
+ depth = READ_ONCE(sbq->sb.depth);
+ if (unlikely(hint >= depth)) {
+ hint = depth ? prandom_u32() % depth : 0;
+ this_cpu_write(*sbq->alloc_hint, hint);
+ }
+ nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin);
+
+ if (nr == -1) {
+ /* If the map is full, a hint won't do us much good. */
+ this_cpu_write(*sbq->alloc_hint, 0);
+ } else if (nr == hint || unlikely(sbq->round_robin)) {
+ /* Only update the hint if we used it. */
+ hint = nr + 1;
+ if (hint >= depth - 1)
+ hint = 0;
+ this_cpu_write(*sbq->alloc_hint, hint);
+ }
+
+ return nr;
+}
+EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
+
+static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
+{
+ int i, wake_index;
+
+ wake_index = atomic_read(&sbq->wake_index);
+ for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+ struct sbq_wait_state *ws = &sbq->ws[wake_index];
+
+ if (waitqueue_active(&ws->wait)) {
+ int o = atomic_read(&sbq->wake_index);
+
+ if (wake_index != o)
+ atomic_cmpxchg(&sbq->wake_index, o, wake_index);
+ return ws;
+ }
+
+ wake_index = sbq_index_inc(wake_index);
+ }
+
+ return NULL;
+}
+
+static void sbq_wake_up(struct sbitmap_queue *sbq)
+{
+ struct sbq_wait_state *ws;
+ int wait_cnt;
+
+ /* Ensure that the wait list checks occur after clear_bit(). */
+ smp_mb();
+
+ ws = sbq_wake_ptr(sbq);
+ if (!ws)
+ return;
+
+ wait_cnt = atomic_dec_return(&ws->wait_cnt);
+ if (unlikely(wait_cnt < 0))
+ wait_cnt = atomic_inc_return(&ws->wait_cnt);
+ if (wait_cnt == 0) {
+ atomic_add(sbq->wake_batch, &ws->wait_cnt);
+ sbq_index_atomic_inc(&sbq->wake_index);
+ wake_up(&ws->wait);
+ }
+}
+
+void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
+ unsigned int cpu)
+{
+ sbitmap_clear_bit(&sbq->sb, nr);
+ sbq_wake_up(sbq);
+ if (likely(!sbq->round_robin && nr < sbq->sb.depth))
+ *per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
+
+void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
+{
+ int i, wake_index;
+
+ /*
+ * Make sure all changes prior to this are visible from other CPUs.
+ */
+ smp_mb();
+ wake_index = atomic_read(&sbq->wake_index);
+ for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+ struct sbq_wait_state *ws = &sbq->ws[wake_index];
+
+ if (waitqueue_active(&ws->wait))
+ wake_up(&ws->wait);
+
+ wake_index = sbq_index_inc(wake_index);
+ }
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all);
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 60f77f1d470a..f87d138e9672 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -50,7 +50,7 @@
STACK_ALLOC_ALIGN)
#define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \
STACK_ALLOC_NULL_PROTECTION_BITS - STACK_ALLOC_OFFSET_BITS)
-#define STACK_ALLOC_SLABS_CAP 1024
+#define STACK_ALLOC_SLABS_CAP 8192
#define STACK_ALLOC_MAX_SLABS \
(((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \
(1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP)
@@ -192,6 +192,7 @@ void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace)
trace->entries = stack->entries;
trace->skip = 0;
}
+EXPORT_SYMBOL_GPL(depot_fetch_stack);
/**
* depot_save_stack - save stack in a stack depot.
@@ -283,3 +284,4 @@ exit:
fast_exit:
return retval;
}
+EXPORT_SYMBOL_GPL(depot_save_stack);
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index 9c5fe8110413..7e35fc450c5b 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -1,6 +1,7 @@
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/kasan-checks.h>
+#include <linux/thread_info.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/errno.h>
@@ -111,6 +112,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
long retval;
kasan_check_write(dst, count);
+ check_object_size(dst, count, false);
user_access_begin();
retval = do_strncpy_from_user(dst, src, count, max);
user_access_end();
diff --git a/lib/syscall.c b/lib/syscall.c
index e30e03932480..63239e097b13 100644
--- a/lib/syscall.c
+++ b/lib/syscall.c
@@ -7,9 +7,19 @@ static int collect_syscall(struct task_struct *target, long *callno,
unsigned long args[6], unsigned int maxargs,
unsigned long *sp, unsigned long *pc)
{
- struct pt_regs *regs = task_pt_regs(target);
- if (unlikely(!regs))
+ struct pt_regs *regs;
+
+ if (!try_get_task_stack(target)) {
+ /* Task has no stack, so the task isn't in a syscall. */
+ *callno = -1;
+ return 0;
+ }
+
+ regs = task_pt_regs(target);
+ if (unlikely(!regs)) {
+ put_task_stack(target);
return -EAGAIN;
+ }
*sp = user_stack_pointer(regs);
*pc = instruction_pointer(regs);
@@ -18,6 +28,7 @@ static int collect_syscall(struct task_struct *target, long *callno,
if (*callno != -1L && maxargs > 0)
syscall_get_arguments(target, regs, 0, maxargs, args);
+ put_task_stack(target);
return 0;
}
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 93f45011a59d..0362da0b66c3 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -4831,7 +4831,7 @@ static struct bpf_test tests[] = {
{ },
INTERNAL,
{ 0x34 },
- { { 1, 0xbef } },
+ { { ETH_HLEN, 0xbef } },
.fill_helper = bpf_fill_ld_abs_vlan_push_pop,
},
/*
@@ -5485,6 +5485,7 @@ static struct sk_buff *populate_skb(char *buf, int size)
skb->hash = SKB_HASH;
skb->queue_mapping = SKB_QUEUE_MAP;
skb->vlan_tci = SKB_VLAN_TCI;
+ skb->vlan_proto = htons(ETH_P_IP);
skb->dev = &dev;
skb->dev->ifindex = SKB_DEV_IFINDEX;
skb->dev->type = SKB_DEV_TYPE;
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 5e51872b3fc1..fbdf87920093 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -20,6 +20,11 @@
#include <linux/uaccess.h>
#include <linux/module.h>
+/*
+ * Note: test functions are marked noinline so that their names appear in
+ * reports.
+ */
+
static noinline void __init kmalloc_oob_right(void)
{
char *ptr;
@@ -411,6 +416,29 @@ static noinline void __init copy_user_test(void)
kfree(kmem);
}
+static noinline void __init use_after_scope_test(void)
+{
+ volatile char *volatile p;
+
+ pr_info("use-after-scope on int\n");
+ {
+ int local = 0;
+
+ p = (char *)&local;
+ }
+ p[0] = 1;
+ p[3] = 1;
+
+ pr_info("use-after-scope on array\n");
+ {
+ char local[1024] = {0};
+
+ p = local;
+ }
+ p[0] = 1;
+ p[1023] = 1;
+}
+
static int __init kmalloc_tests_init(void)
{
kmalloc_oob_right();
@@ -436,6 +464,7 @@ static int __init kmalloc_tests_init(void)
kasan_global_oob();
ksize_unpoisons_memory();
copy_user_test();
+ use_after_scope_test();
return -EAGAIN;
}
diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c
index f0b323abb4c6..ae8d2491133c 100644
--- a/lib/ucs2_string.c
+++ b/lib/ucs2_string.c
@@ -56,7 +56,7 @@ ucs2_utf8size(const ucs2_char_t *src)
unsigned long i;
unsigned long j = 0;
- for (i = 0; i < ucs2_strlen(src); i++) {
+ for (i = 0; src[i]; i++) {
u16 c = src[i];
if (c >= 0x800)
diff --git a/lib/win_minmax.c b/lib/win_minmax.c
new file mode 100644
index 000000000000..c8420d404926
--- /dev/null
+++ b/lib/win_minmax.c
@@ -0,0 +1,98 @@
+/**
+ * lib/minmax.c: windowed min/max tracker
+ *
+ * Kathleen Nichols' algorithm for tracking the minimum (or maximum)
+ * value of a data stream over some fixed time interval. (E.g.,
+ * the minimum RTT over the past five minutes.) It uses constant
+ * space and constant time per update yet almost always delivers
+ * the same minimum as an implementation that has to keep all the
+ * data in the window.
+ *
+ * The algorithm keeps track of the best, 2nd best & 3rd best min
+ * values, maintaining an invariant that the measurement time of
+ * the n'th best >= n-1'th best. It also makes sure that the three
+ * values are widely separated in the time window since that bounds
+ * the worse case error when that data is monotonically increasing
+ * over the window.
+ *
+ * Upon getting a new min, we can forget everything earlier because
+ * it has no value - the new min is <= everything else in the window
+ * by definition and it's the most recent. So we restart fresh on
+ * every new min and overwrites 2nd & 3rd choices. The same property
+ * holds for 2nd & 3rd best.
+ */
+#include <linux/module.h>
+#include <linux/win_minmax.h>
+
+/* As time advances, update the 1st, 2nd, and 3rd choices. */
+static u32 minmax_subwin_update(struct minmax *m, u32 win,
+ const struct minmax_sample *val)
+{
+ u32 dt = val->t - m->s[0].t;
+
+ if (unlikely(dt > win)) {
+ /*
+ * Passed entire window without a new val so make 2nd
+ * choice the new val & 3rd choice the new 2nd choice.
+ * we may have to iterate this since our 2nd choice
+ * may also be outside the window (we checked on entry
+ * that the third choice was in the window).
+ */
+ m->s[0] = m->s[1];
+ m->s[1] = m->s[2];
+ m->s[2] = *val;
+ if (unlikely(val->t - m->s[0].t > win)) {
+ m->s[0] = m->s[1];
+ m->s[1] = m->s[2];
+ m->s[2] = *val;
+ }
+ } else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) {
+ /*
+ * We've passed a quarter of the window without a new val
+ * so take a 2nd choice from the 2nd quarter of the window.
+ */
+ m->s[2] = m->s[1] = *val;
+ } else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) {
+ /*
+ * We've passed half the window without finding a new val
+ * so take a 3rd choice from the last half of the window
+ */
+ m->s[2] = *val;
+ }
+ return m->s[0].v;
+}
+
+/* Check if new measurement updates the 1st, 2nd or 3rd choice max. */
+u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas)
+{
+ struct minmax_sample val = { .t = t, .v = meas };
+
+ if (unlikely(val.v >= m->s[0].v) || /* found new max? */
+ unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */
+ return minmax_reset(m, t, meas); /* forget earlier samples */
+
+ if (unlikely(val.v >= m->s[1].v))
+ m->s[2] = m->s[1] = val;
+ else if (unlikely(val.v >= m->s[2].v))
+ m->s[2] = val;
+
+ return minmax_subwin_update(m, win, &val);
+}
+EXPORT_SYMBOL(minmax_running_max);
+
+/* Check if new measurement updates the 1st, 2nd or 3rd choice min. */
+u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas)
+{
+ struct minmax_sample val = { .t = t, .v = meas };
+
+ if (unlikely(val.v <= m->s[0].v) || /* found new min? */
+ unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */
+ return minmax_reset(m, t, meas); /* forget earlier samples */
+
+ if (unlikely(val.v <= m->s[1].v))
+ m->s[2] = m->s[1] = val;
+ else if (unlikely(val.v <= m->s[2].v))
+ m->s[2] = val;
+
+ return minmax_subwin_update(m, win, &val);
+}