From 8e5a2b9893f36457582596fdade10f6feb2797ee Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Thu, 21 Feb 2019 15:50:19 -0800 Subject: percpu: update free path with correct new free region When updating the chunk's contig_hint on the free path of a hint that does not touch the page boundaries, it was incorrectly using the starting offset of the free region and the block's contig_hint. This could lead to incorrect assumptions about fit given a size and better alignment of the start. Fix this by using (end - start) as this is only called when updating a hint within a block. Signed-off-by: Dennis Zhou Reviewed-by: Peng Fan --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index 2e6fc8d552c9..938f295a60d4 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -871,7 +871,7 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, pcpu_chunk_refresh_hint(chunk); else pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start), - s_block->contig_hint); + end - start); } /** -- cgit v1.2.3-55-g7522 From 8c43004af01635cc9fbb11031d070e5e0d327ef2 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Thu, 21 Feb 2019 15:54:11 -0800 Subject: percpu: do not search past bitmap when allocating an area pcpu_find_block_fit() guarantees that a fit is found within PCPU_BITMAP_BLOCK_BITS. Iteration is used to determine the first fit as it compares against the block's contig_hint. This can lead to incorrectly scanning past the end of the bitmap. The behavior was okay given the check after for bit_off >= end and the correctness of the hints from pcpu_find_block_fit(). This patch fixes this by bounding the end offset by the number of bits in a chunk. Signed-off-by: Dennis Zhou Reviewed-by: Peng Fan --- mm/percpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index 938f295a60d4..769b7583975b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -988,7 +988,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, /* * Search to find a fit. */ - end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS; + end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS, + pcpu_chunk_map_bits(chunk)); bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start, alloc_bits, align_mask); if (bit_off >= end) -- cgit v1.2.3-55-g7522 From d9f3a01eebe80180babd8541406490020f184d17 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Thu, 21 Feb 2019 15:44:35 -0800 Subject: percpu: introduce helper to determine if two regions overlap While block hints were always accurate, it's possible when spanning across blocks that we miss updating the chunk's contig_hint. Rather than rely on correctness of the boundaries of hints, do a full overlap comparison. A future patch introduces the scan_hint which makes the contig_hint slightly fuzzy as they can at times be smaller than the actual hint. Signed-off-by: Dennis Zhou --- mm/percpu.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index 769b7583975b..cbace9e79f2d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -546,6 +546,21 @@ static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off, bitmap_weight(chunk->populated, page_start); } +/* + * pcpu_region_overlap - determines if two regions overlap + * @a: start of first region, inclusive + * @b: end of first region, exclusive + * @x: start of second region, inclusive + * @y: end of second region, exclusive + * + * This is used to determine if the hint region [a, b) overlaps with the + * allocated region [x, y). + */ +static inline bool pcpu_region_overlap(int a, int b, int x, int y) +{ + return (a < y) && (x < b); +} + /** * pcpu_chunk_update - updates the chunk metadata given a free area * @chunk: chunk of interest @@ -710,8 +725,11 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, PCPU_BITMAP_BLOCK_BITS, s_off + bits); - if (s_off >= s_block->contig_hint_start && - s_off < s_block->contig_hint_start + s_block->contig_hint) { + if (pcpu_region_overlap(s_block->contig_hint_start, + s_block->contig_hint_start + + s_block->contig_hint, + s_off, + s_off + bits)) { /* block contig hint is broken - scan to fix it */ pcpu_block_refresh_hint(chunk, s_index); } else { @@ -764,8 +782,10 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, * contig hint is broken. Otherwise, it means a smaller space * was used and therefore the chunk contig hint is still correct. */ - if (bit_off >= chunk->contig_bits_start && - bit_off < chunk->contig_bits_start + chunk->contig_bits) + if (pcpu_region_overlap(chunk->contig_bits_start, + chunk->contig_bits_start + chunk->contig_bits, + bit_off, + bit_off + bits)) pcpu_chunk_refresh_hint(chunk); } -- cgit v1.2.3-55-g7522 From 3e54097beb228ddcd73bb2fd18bafaa1062e9fe4 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Mon, 25 Feb 2019 13:43:38 -0800 Subject: percpu: manage chunks based on contig_bits instead of free_bytes When a chunk becomes fragmented, it can end up having a large number of small allocation areas free. The free_bytes sorting of chunks leads to unnecessary checking of chunks that cannot satisfy the allocation. Switch to contig_bits sorting to prevent scanning chunks that may not be able to service the allocation request. Signed-off-by: Dennis Zhou Reviewed-by: Peng Fan --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index cbace9e79f2d..fc1e8efb5222 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -234,7 +234,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0) return 0; - return pcpu_size_to_slot(chunk->free_bytes); + return pcpu_size_to_slot(chunk->contig_bits * PCPU_MIN_ALLOC_SIZE); } /* set the pointer to a chunk in a page struct */ -- cgit v1.2.3-55-g7522 From 8744d859427c6198dce490619809754336954297 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Mon, 25 Feb 2019 09:03:50 -0800 Subject: percpu: relegate chunks unusable when failing small allocations In certain cases, requestors of percpu memory may want specific alignments. However, it is possible to end up in situations where the contig_hint matches, but the alignment does not. This causes excess scanning of chunks that will fail. To prevent this, if a small allocation fails (< 32B), the chunk is moved to the empty list. Once an allocation is freed from that chunk, it is placed back into rotation. Signed-off-by: Dennis Zhou Reviewed-by: Peng Fan --- mm/percpu.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index fc1e8efb5222..2c1a9a2ca13b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -94,6 +94,8 @@ /* the slots are sorted by free bytes left, 1-31 bytes share the same slot */ #define PCPU_SLOT_BASE_SHIFT 5 +/* chunks in slots below this are subject to being sidelined on failed alloc */ +#define PCPU_SLOT_FAIL_THRESHOLD 3 #define PCPU_EMPTY_POP_PAGES_LOW 2 #define PCPU_EMPTY_POP_PAGES_HIGH 4 @@ -488,6 +490,22 @@ static void pcpu_mem_free(void *ptr) kvfree(ptr); } +static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot, + bool move_front) +{ + if (chunk != pcpu_reserved_chunk) { + if (move_front) + list_move(&chunk->list, &pcpu_slot[slot]); + else + list_move_tail(&chunk->list, &pcpu_slot[slot]); + } +} + +static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot) +{ + __pcpu_chunk_move(chunk, slot, true); +} + /** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest @@ -505,12 +523,8 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); - if (chunk != pcpu_reserved_chunk && oslot != nslot) { - if (oslot < nslot) - list_move(&chunk->list, &pcpu_slot[nslot]); - else - list_move_tail(&chunk->list, &pcpu_slot[nslot]); - } + if (oslot != nslot) + __pcpu_chunk_move(chunk, nslot, oslot < nslot); } /** @@ -1395,7 +1409,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; bool do_warn = !(gfp & __GFP_NOWARN); static int warn_limit = 10; - struct pcpu_chunk *chunk; + struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; unsigned long flags; @@ -1457,11 +1471,14 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { - list_for_each_entry(chunk, &pcpu_slot[slot], list) { + list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) { off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); - if (off < 0) + if (off < 0) { + if (slot < PCPU_SLOT_FAIL_THRESHOLD) + pcpu_chunk_move(chunk, 0); continue; + } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) -- cgit v1.2.3-55-g7522 From b239f7daf5530f562000bf55f02cc8028703f507 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 13 Feb 2019 11:10:30 -0800 Subject: percpu: set PCPU_BITMAP_BLOCK_SIZE to PAGE_SIZE Previously, block size was flexible based on the constraint that the GCD(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) > 1. However, this carried the overhead that keeping a floating number of populated free pages required scanning over the free regions of a chunk. Setting the block size to be fixed at PAGE_SIZE lets us know when an empty page becomes used as we will break a full contig_hint of a block. This means we no longer have to scan the whole chunk upon breaking a contig_hint which empty page management piggybacked off. A later patch takes advantage of this to optimize the allocation path by only scanning forward using the scan_hint introduced later too. Signed-off-by: Dennis Zhou Reviewed-by: Peng Fan --- include/linux/percpu.h | 12 ++---- mm/percpu-km.c | 2 +- mm/percpu.c | 110 ++++++++++++++++++++----------------------------- 3 files changed, 48 insertions(+), 76 deletions(-) (limited to 'mm/percpu.c') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 70b7123f38c7..9909dc0e273a 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -26,16 +26,10 @@ #define PCPU_MIN_ALLOC_SHIFT 2 #define PCPU_MIN_ALLOC_SIZE (1 << PCPU_MIN_ALLOC_SHIFT) -/* number of bits per page, used to trigger a scan if blocks are > PAGE_SIZE */ -#define PCPU_BITS_PER_PAGE (PAGE_SIZE >> PCPU_MIN_ALLOC_SHIFT) - /* - * This determines the size of each metadata block. There are several subtle - * constraints around this constant. The reserved region must be a multiple of - * PCPU_BITMAP_BLOCK_SIZE. Additionally, PCPU_BITMAP_BLOCK_SIZE must be a - * multiple of PAGE_SIZE or PAGE_SIZE must be a multiple of - * PCPU_BITMAP_BLOCK_SIZE to align with the populated page map. The unit_size - * also has to be a multiple of PCPU_BITMAP_BLOCK_SIZE to ensure full blocks. + * The PCPU_BITMAP_BLOCK_SIZE must be the same size as PAGE_SIZE as the + * updating of hints is used to manage the nr_empty_pop_pages in both + * the chunk and globally. */ #define PCPU_BITMAP_BLOCK_SIZE PAGE_SIZE #define PCPU_BITMAP_BLOCK_BITS (PCPU_BITMAP_BLOCK_SIZE >> \ diff --git a/mm/percpu-km.c b/mm/percpu-km.c index b68d5df14731..3a2ff5c9192c 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -70,7 +70,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) chunk->base_addr = page_address(pages); spin_lock_irqsave(&pcpu_lock, flags); - pcpu_chunk_populated(chunk, 0, nr_pages, false); + pcpu_chunk_populated(chunk, 0, nr_pages); spin_unlock_irqrestore(&pcpu_lock, flags); pcpu_stats_chunk_alloc(); diff --git a/mm/percpu.c b/mm/percpu.c index 2c1a9a2ca13b..0e98616501b3 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -527,37 +527,20 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) __pcpu_chunk_move(chunk, nslot, oslot < nslot); } -/** - * pcpu_cnt_pop_pages- counts populated backing pages in range +/* + * pcpu_update_empty_pages - update empty page counters * @chunk: chunk of interest - * @bit_off: start offset - * @bits: size of area to check + * @nr: nr of empty pages * - * Calculates the number of populated pages in the region - * [page_start, page_end). This keeps track of how many empty populated - * pages are available and decide if async work should be scheduled. - * - * RETURNS: - * The nr of populated pages. + * This is used to keep track of the empty pages now based on the premise + * a md_block covers a page. The hint update functions recognize if a block + * is made full or broken to calculate deltas for keeping track of free pages. */ -static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off, - int bits) +static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr) { - int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE); - int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); - - if (page_start >= page_end) - return 0; - - /* - * bitmap_weight counts the number of bits set in a bitmap up to - * the specified number of bits. This is counting the populated - * pages up to page_end and then subtracting the populated pages - * up to page_start to count the populated pages in - * [page_start, page_end). - */ - return bitmap_weight(chunk->populated, page_end) - - bitmap_weight(chunk->populated, page_start); + chunk->nr_empty_pop_pages += nr; + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages += nr; } /* @@ -608,36 +591,19 @@ static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits) * Updates: * chunk->contig_bits * chunk->contig_bits_start - * nr_empty_pop_pages (chunk and global) */ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) { - int bit_off, bits, nr_empty_pop_pages; + int bit_off, bits; /* clear metadata */ chunk->contig_bits = 0; bit_off = chunk->first_bit; - bits = nr_empty_pop_pages = 0; + bits = 0; pcpu_for_each_md_free_region(chunk, bit_off, bits) { pcpu_chunk_update(chunk, bit_off, bits); - - nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits); } - - /* - * Keep track of nr_empty_pop_pages. - * - * The chunk maintains the previous number of free pages it held, - * so the delta is used to update the global counter. The reserved - * chunk is not part of the free page count as they are populated - * at init and are special to serving reserved allocations. - */ - if (chunk != pcpu_reserved_chunk) - pcpu_nr_empty_pop_pages += - (nr_empty_pop_pages - chunk->nr_empty_pop_pages); - - chunk->nr_empty_pop_pages = nr_empty_pop_pages; } /** @@ -709,6 +675,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, int bits) { + int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ @@ -733,6 +700,9 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, * If the allocation breaks the contig_hint, a scan is required to * restore this hint. */ + if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) + nr_empty_pages++; + if (s_off == s_block->first_free) s_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, s_index), @@ -760,6 +730,9 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, * Update e_block. */ if (s_index != e_index) { + if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) + nr_empty_pages++; + /* * When the allocation is across blocks, the end is along * the left part of the e_block. @@ -784,6 +757,7 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, } /* update in-between md_blocks */ + nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->contig_hint = 0; block->left_free = 0; @@ -791,6 +765,9 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, } } + if (nr_empty_pages) + pcpu_update_empty_pages(chunk, -nr_empty_pages); + /* * The only time a full chunk scan is required is if the chunk * contig hint is broken. Otherwise, it means a smaller space @@ -823,6 +800,7 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, int bits) { + int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ @@ -876,14 +854,19 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, /* update s_block */ e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS; + if (!start && e_off == PCPU_BITMAP_BLOCK_BITS) + nr_empty_pages++; pcpu_block_update(s_block, start, e_off); /* freeing in the same block */ if (s_index != e_index) { /* update e_block */ + if (end == PCPU_BITMAP_BLOCK_BITS) + nr_empty_pages++; pcpu_block_update(e_block, 0, end); /* reset md_blocks in the middle */ + nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->first_free = 0; block->contig_hint_start = 0; @@ -893,15 +876,16 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, } } + if (nr_empty_pages) + pcpu_update_empty_pages(chunk, nr_empty_pages); + /* - * Refresh chunk metadata when the free makes a page free, a block - * free, or spans across blocks. The contig hint may be off by up to - * a page, but if the hint is contained in a block, it will be accurate - * with the else condition below. + * Refresh chunk metadata when the free makes a block free or spans + * across blocks. The contig_hint may be off by up to a page, but if + * the contig_hint is contained in a block, it will be accurate with + * the else condition below. */ - if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) > - ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) || - s_index != e_index) + if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index) pcpu_chunk_refresh_hint(chunk); else pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start), @@ -1178,9 +1162,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->immutable = true; bitmap_fill(chunk->populated, chunk->nr_pages); chunk->nr_populated = chunk->nr_pages; - chunk->nr_empty_pop_pages = - pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE, - map_size / PCPU_MIN_ALLOC_SIZE); + chunk->nr_empty_pop_pages = chunk->nr_pages; chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE; chunk->free_bytes = map_size; @@ -1275,7 +1257,6 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) * @chunk: pcpu_chunk which got populated * @page_start: the start page * @page_end: the end page - * @for_alloc: if this is to populate for allocation * * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each @@ -1285,7 +1266,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) * is to serve an allocation in that area. */ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, - int page_end, bool for_alloc) + int page_end) { int nr = page_end - page_start; @@ -1295,10 +1276,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, chunk->nr_populated += nr; pcpu_nr_populated += nr; - if (!for_alloc) { - chunk->nr_empty_pop_pages += nr; - pcpu_nr_empty_pop_pages += nr; - } + pcpu_update_empty_pages(chunk, nr); } /** @@ -1320,9 +1298,9 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, bitmap_clear(chunk->populated, page_start, nr); chunk->nr_populated -= nr; - chunk->nr_empty_pop_pages -= nr; - pcpu_nr_empty_pop_pages -= nr; pcpu_nr_populated -= nr; + + pcpu_update_empty_pages(chunk, -nr); } /* @@ -1537,7 +1515,7 @@ area_found: err = "failed to populate"; goto fail_unlock; } - pcpu_chunk_populated(chunk, rs, re, true); + pcpu_chunk_populated(chunk, rs, re); spin_unlock_irqrestore(&pcpu_lock, flags); } @@ -1736,7 +1714,7 @@ retry_pop: if (!ret) { nr_to_pop -= nr; spin_lock_irq(&pcpu_lock); - pcpu_chunk_populated(chunk, rs, rs + nr, false); + pcpu_chunk_populated(chunk, rs, rs + nr); spin_unlock_irq(&pcpu_lock); } else { nr_to_pop = 0; -- cgit v1.2.3-55-g7522 From 382b88e961c7a4196e01cef3249297583d02d608 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Mon, 25 Feb 2019 13:41:45 -0800 Subject: percpu: add block level scan_hint Fragmentation can cause both blocks and chunks to have an early first_firee bit available, but only able to satisfy allocations much later on. This patch introduces a scan_hint to help mitigate some unnecessary scanning. The scan_hint remembers the largest area prior to the contig_hint. If the contig_hint == scan_hint, then scan_hint_start > contig_hint_start. This is necessary for scan_hint discovery when refreshing a block. Signed-off-by: Dennis Zhou Reviewed-by: Peng Fan --- mm/percpu-internal.h | 9 +++++ mm/percpu.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 103 insertions(+), 7 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index b1739dc06b73..ec58b244545d 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -9,8 +9,17 @@ * pcpu_block_md is the metadata block struct. * Each chunk's bitmap is split into a number of full blocks. * All units are in terms of bits. + * + * The scan hint is the largest known contiguous area before the contig hint. + * It is not necessarily the actual largest contig hint though. There is an + * invariant that the scan_hint_start > contig_hint_start iff + * scan_hint == contig_hint. This is necessary because when scanning forward, + * we don't know if a new contig hint would be better than the current one. */ struct pcpu_block_md { + int scan_hint; /* scan hint for block */ + int scan_hint_start; /* block relative starting + position of the scan hint */ int contig_hint; /* contig hint for block */ int contig_hint_start; /* block relative starting position of the contig hint */ diff --git a/mm/percpu.c b/mm/percpu.c index 0e98616501b3..48c3da6cff7f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -320,6 +320,34 @@ static unsigned long pcpu_block_off_to_off(int index, int off) return index * PCPU_BITMAP_BLOCK_BITS + off; } +/* + * pcpu_next_hint - determine which hint to use + * @block: block of interest + * @alloc_bits: size of allocation + * + * This determines if we should scan based on the scan_hint or first_free. + * In general, we want to scan from first_free to fulfill allocations by + * first fit. However, if we know a scan_hint at position scan_hint_start + * cannot fulfill an allocation, we can begin scanning from there knowing + * the contig_hint will be our fallback. + */ +static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits) +{ + /* + * The three conditions below determine if we can skip past the + * scan_hint. First, does the scan hint exist. Second, is the + * contig_hint after the scan_hint (possibly not true iff + * contig_hint == scan_hint). Third, is the allocation request + * larger than the scan_hint. + */ + if (block->scan_hint && + block->contig_hint_start > block->scan_hint_start && + alloc_bits > block->scan_hint) + return block->scan_hint_start + block->scan_hint; + + return block->first_free; +} + /** * pcpu_next_md_free_region - finds the next hint free area * @chunk: chunk of interest @@ -415,9 +443,11 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, if (block->contig_hint && block->contig_hint_start >= block_off && block->contig_hint >= *bits + alloc_bits) { + int start = pcpu_next_hint(block, alloc_bits); + *bits += alloc_bits + block->contig_hint_start - - block->first_free; - *bit_off = pcpu_block_off_to_off(i, block->first_free); + start; + *bit_off = pcpu_block_off_to_off(i, start); return; } /* reset to satisfy the second predicate above */ @@ -628,12 +658,57 @@ static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) block->right_free = contig; if (contig > block->contig_hint) { + /* promote the old contig_hint to be the new scan_hint */ + if (start > block->contig_hint_start) { + if (block->contig_hint > block->scan_hint) { + block->scan_hint_start = + block->contig_hint_start; + block->scan_hint = block->contig_hint; + } else if (start < block->scan_hint_start) { + /* + * The old contig_hint == scan_hint. But, the + * new contig is larger so hold the invariant + * scan_hint_start < contig_hint_start. + */ + block->scan_hint = 0; + } + } else { + block->scan_hint = 0; + } block->contig_hint_start = start; block->contig_hint = contig; - } else if (block->contig_hint_start && contig == block->contig_hint && - (!start || __ffs(start) > __ffs(block->contig_hint_start))) { - /* use the start with the best alignment */ - block->contig_hint_start = start; + } else if (contig == block->contig_hint) { + if (block->contig_hint_start && + (!start || + __ffs(start) > __ffs(block->contig_hint_start))) { + /* start has a better alignment so use it */ + block->contig_hint_start = start; + if (start < block->scan_hint_start && + block->contig_hint > block->scan_hint) + block->scan_hint = 0; + } else if (start > block->scan_hint_start || + block->contig_hint > block->scan_hint) { + /* + * Knowing contig == contig_hint, update the scan_hint + * if it is farther than or larger than the current + * scan_hint. + */ + block->scan_hint_start = start; + block->scan_hint = contig; + } + } else { + /* + * The region is smaller than the contig_hint. So only update + * the scan_hint if it is larger than or equal and farther than + * the current scan_hint. + */ + if ((start < block->contig_hint_start && + (contig > block->scan_hint || + (contig == block->scan_hint && + start > block->scan_hint_start)))) { + block->scan_hint_start = start; + block->scan_hint = contig; + } } } @@ -652,7 +727,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) int rs, re; /* region start, region end */ /* clear hints */ - block->contig_hint = 0; + block->contig_hint = block->scan_hint = 0; block->left_free = block->right_free = 0; /* iterate over free areas and update the contig hints */ @@ -709,6 +784,12 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, PCPU_BITMAP_BLOCK_BITS, s_off + bits); + if (pcpu_region_overlap(s_block->scan_hint_start, + s_block->scan_hint_start + s_block->scan_hint, + s_off, + s_off + bits)) + s_block->scan_hint = 0; + if (pcpu_region_overlap(s_block->contig_hint_start, s_block->contig_hint_start + s_block->contig_hint, @@ -745,6 +826,9 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, /* reset the block */ e_block++; } else { + if (e_off > e_block->scan_hint_start) + e_block->scan_hint = 0; + if (e_off > e_block->contig_hint_start) { /* contig hint is broken - scan to fix it */ pcpu_block_refresh_hint(chunk, e_index); @@ -759,6 +843,7 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, /* update in-between md_blocks */ nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { + block->scan_hint = 0; block->contig_hint = 0; block->left_free = 0; block->right_free = 0; @@ -869,6 +954,7 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->first_free = 0; + block->scan_hint = 0; block->contig_hint_start = 0; block->contig_hint = PCPU_BITMAP_BLOCK_BITS; block->left_free = PCPU_BITMAP_BLOCK_BITS; @@ -1080,6 +1166,7 @@ static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) for (md_block = chunk->md_blocks; md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); md_block++) { + md_block->scan_hint = 0; md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS; md_block->left_free = PCPU_BITMAP_BLOCK_BITS; md_block->right_free = PCPU_BITMAP_BLOCK_BITS; -- cgit v1.2.3-55-g7522 From b89462a9c5f4a3ac5160e7b3599bb09c94b94880 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 22 Feb 2019 09:03:16 -0800 Subject: percpu: remember largest area skipped during allocation Percpu allocations attempt to do first fit by scanning forward from the first_free of a block. However, fragmentation from allocation requests can cause holes not seen by block hint update functions. To address this, create a local version of bitmap_find_next_zero_area_off() that remembers the largest area skipped over. The caveat is that it only sees regions skipped over due to not fitting, not regions skipped due to alignment. Prior to updating the scan_hint, a scan backwards is done to try and recover free bits skipped due to alignment. While this can cause scanning to miss earlier possible free areas, smaller allocations will eventually fill those holes due to first fit. Signed-off-by: Dennis Zhou --- mm/percpu.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 99 insertions(+), 2 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index 48c3da6cff7f..b82f3cacf2ed 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -712,6 +712,43 @@ static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) } } +/* + * pcpu_block_update_scan - update a block given a free area from a scan + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of free area + * + * Finding the final allocation spot first goes through pcpu_find_block_fit() + * to find a block that can hold the allocation and then pcpu_alloc_area() + * where a scan is used. When allocations require specific alignments, + * we can inadvertently create holes which will not be seen in the alloc + * or free paths. + * + * This takes a given free area hole and updates a block as it may change the + * scan_hint. We need to scan backwards to ensure we don't miss free bits + * from alignment. + */ +static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off, + int bits) +{ + int s_off = pcpu_off_to_block_off(bit_off); + int e_off = s_off + bits; + int s_index, l_bit; + struct pcpu_block_md *block; + + if (e_off > PCPU_BITMAP_BLOCK_BITS) + return; + + s_index = pcpu_off_to_block_index(bit_off); + block = chunk->md_blocks + s_index; + + /* scan backwards in case of alignment skipping free bits */ + l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off); + s_off = (s_off == l_bit) ? 0 : l_bit + 1; + + pcpu_block_update(block, s_off, e_off); +} + /** * pcpu_block_refresh_hint * @chunk: chunk of interest @@ -1060,6 +1097,62 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, return bit_off; } +/* + * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off() + * @map: the address to base the search on + * @size: the bitmap size in bits + * @start: the bitnumber to start searching at + * @nr: the number of zeroed bits we're looking for + * @align_mask: alignment mask for zero area + * @largest_off: offset of the largest area skipped + * @largest_bits: size of the largest area skipped + * + * The @align_mask should be one less than a power of 2. + * + * This is a modified version of bitmap_find_next_zero_area_off() to remember + * the largest area that was skipped. This is imperfect, but in general is + * good enough. The largest remembered region is the largest failed region + * seen. This does not include anything we possibly skipped due to alignment. + * pcpu_block_update_scan() does scan backwards to try and recover what was + * lost to alignment. While this can cause scanning to miss earlier possible + * free areas, smaller allocations will eventually fill those holes. + */ +static unsigned long pcpu_find_zero_area(unsigned long *map, + unsigned long size, + unsigned long start, + unsigned long nr, + unsigned long align_mask, + unsigned long *largest_off, + unsigned long *largest_bits) +{ + unsigned long index, end, i, area_off, area_bits; +again: + index = find_next_zero_bit(map, size, start); + + /* Align allocation */ + index = __ALIGN_MASK(index, align_mask); + area_off = index; + + end = index + nr; + if (end > size) + return end; + i = find_next_bit(map, end, index); + if (i < end) { + area_bits = i - area_off; + /* remember largest unused area with best alignment */ + if (area_bits > *largest_bits || + (area_bits == *largest_bits && *largest_off && + (!area_off || __ffs(area_off) > __ffs(*largest_off)))) { + *largest_off = area_off; + *largest_bits = area_bits; + } + + start = i + 1; + goto again; + } + return index; +} + /** * pcpu_alloc_area - allocates an area from a pcpu_chunk * @chunk: chunk of interest @@ -1083,6 +1176,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, size_t align, int start) { size_t align_mask = (align) ? (align - 1) : 0; + unsigned long area_off = 0, area_bits = 0; int bit_off, end, oslot; lockdep_assert_held(&pcpu_lock); @@ -1094,11 +1188,14 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, */ end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS, pcpu_chunk_map_bits(chunk)); - bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start, - alloc_bits, align_mask); + bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits, + align_mask, &area_off, &area_bits); if (bit_off >= end) return -1; + if (area_bits) + pcpu_block_update_scan(chunk, area_off, area_bits); + /* update alloc map */ bitmap_set(chunk->alloc_map, bit_off, alloc_bits); -- cgit v1.2.3-55-g7522 From da3afdd5bb5428fd38b4b64f2d5e897c3bb78354 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Mon, 25 Feb 2019 14:10:15 -0800 Subject: percpu: use block scan_hint to only scan forward Blocks now remember the latest scan_hint. This can be used on the allocation path as when a contig_hint is broken, we can promote the scan_hint to the contig_hint and scan forward from there. This works because pcpu_block_refresh_hint() is only called on the allocation path while block free regions are updated manually in pcpu_block_update_hint_free(). Signed-off-by: Dennis Zhou --- mm/percpu.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index b82f3cacf2ed..c5250e162d4d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -761,14 +761,23 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); - int rs, re; /* region start, region end */ + int rs, re, start; /* region start, region end */ + + /* promote scan_hint to contig_hint */ + if (block->scan_hint) { + start = block->scan_hint_start + block->scan_hint; + block->contig_hint_start = block->scan_hint_start; + block->contig_hint = block->scan_hint; + block->scan_hint = 0; + } else { + start = block->first_free; + block->contig_hint = 0; + } - /* clear hints */ - block->contig_hint = block->scan_hint = 0; - block->left_free = block->right_free = 0; + block->right_free = 0; /* iterate over free areas and update the contig hints */ - pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free, + pcpu_for_each_unpop_region(alloc_map, rs, re, start, PCPU_BITMAP_BLOCK_BITS) { pcpu_block_update(block, rs, re); } @@ -833,6 +842,8 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, s_off, s_off + bits)) { /* block contig hint is broken - scan to fix it */ + if (!s_off) + s_block->left_free = 0; pcpu_block_refresh_hint(chunk, s_index); } else { /* update left and right contig manually */ @@ -866,11 +877,11 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, if (e_off > e_block->scan_hint_start) e_block->scan_hint = 0; + e_block->left_free = 0; if (e_off > e_block->contig_hint_start) { /* contig hint is broken - scan to fix it */ pcpu_block_refresh_hint(chunk, e_index); } else { - e_block->left_free = 0; e_block->right_free = min_t(int, e_block->right_free, PCPU_BITMAP_BLOCK_BITS - e_off); -- cgit v1.2.3-55-g7522 From 047924c96898266e9a37412434abd1db72600384 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Tue, 26 Feb 2019 09:56:16 -0800 Subject: percpu: make pcpu_block_md generic In reality, a chunk is just a block covering a larger number of bits. The hints themselves are one in the same. Rather than maintaining the hints separately, first introduce nr_bits to genericize pcpu_block_update() to correctly maintain block->right_free. The next patch will convert chunk hints to be managed as a pcpu_block_md. Signed-off-by: Dennis Zhou Reviewed-by: Peng Fan --- mm/percpu-internal.h | 1 + mm/percpu.c | 20 +++++++++++++------- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index ec58b244545d..119bd1119aa7 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -28,6 +28,7 @@ struct pcpu_block_md { int right_free; /* size of free space along the right side of the block */ int first_free; /* block position of first free */ + int nr_bits; /* total bits responsible for */ }; struct pcpu_chunk { diff --git a/mm/percpu.c b/mm/percpu.c index c5250e162d4d..acc72d37a830 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -654,7 +654,7 @@ static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) if (start == 0) block->left_free = contig; - if (end == PCPU_BITMAP_BLOCK_BITS) + if (end == block->nr_bits) block->right_free = contig; if (contig > block->contig_hint) { @@ -1267,18 +1267,24 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off) pcpu_chunk_relocate(chunk, oslot); } +static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits) +{ + block->scan_hint = 0; + block->contig_hint = nr_bits; + block->left_free = nr_bits; + block->right_free = nr_bits; + block->first_free = 0; + block->nr_bits = nr_bits; +} + static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) { struct pcpu_block_md *md_block; for (md_block = chunk->md_blocks; md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); - md_block++) { - md_block->scan_hint = 0; - md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS; - md_block->left_free = PCPU_BITMAP_BLOCK_BITS; - md_block->right_free = PCPU_BITMAP_BLOCK_BITS; - } + md_block++) + pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS); } /** -- cgit v1.2.3-55-g7522 From 92c14cab43267411bc9160f23d55a7548d814483 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Tue, 26 Feb 2019 10:00:08 -0800 Subject: percpu: convert chunk hints to be based on pcpu_block_md As mentioned in the last patch, a chunk's hints are no different than a block just responsible for more bits. This converts chunk level hints to use a pcpu_block_md to maintain them. This lets us reuse the same hint helper functions as a block. The left_free and right_free are unused by the chunk's pcpu_block_md. Signed-off-by: Dennis Zhou Reviewed-by: Peng Fan --- mm/percpu-internal.h | 5 +-- mm/percpu-stats.c | 5 ++- mm/percpu.c | 120 +++++++++++++++++++++++---------------------------- 3 files changed, 57 insertions(+), 73 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 119bd1119aa7..0468ba500bd4 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -39,9 +39,7 @@ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ int free_bytes; /* free bytes in the chunk */ - int contig_bits; /* max contiguous size hint */ - int contig_bits_start; /* contig_bits starting - offset */ + struct pcpu_block_md chunk_md; void *base_addr; /* base address of this chunk */ unsigned long *alloc_map; /* allocation map */ @@ -49,7 +47,6 @@ struct pcpu_chunk { struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ - int first_bit; /* no free below this */ bool immutable; /* no [de]population allowed */ int start_offset; /* the overlap with the previous region to have a page aligned diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index b5fdd43b60c9..ef5034a0464e 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -53,6 +53,7 @@ static int find_max_nr_alloc(void) static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, int *buffer) { + struct pcpu_block_md *chunk_md = &chunk->chunk_md; int i, last_alloc, as_len, start, end; int *alloc_sizes, *p; /* statistics */ @@ -121,9 +122,9 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, P("nr_alloc", chunk->nr_alloc); P("max_alloc_size", chunk->max_alloc_size); P("empty_pop_pages", chunk->nr_empty_pop_pages); - P("first_bit", chunk->first_bit); + P("first_bit", chunk_md->first_free); P("free_bytes", chunk->free_bytes); - P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE); + P("contig_bytes", chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); P("sum_frag", sum_frag); P("max_frag", max_frag); P("cur_min_alloc", cur_min_alloc); diff --git a/mm/percpu.c b/mm/percpu.c index acc72d37a830..daebf7a5343c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -233,10 +233,13 @@ static int pcpu_size_to_slot(int size) static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { - if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0) + const struct pcpu_block_md *chunk_md = &chunk->chunk_md; + + if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || + chunk_md->contig_hint == 0) return 0; - return pcpu_size_to_slot(chunk->contig_bits * PCPU_MIN_ALLOC_SIZE); + return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); } /* set the pointer to a chunk in a page struct */ @@ -588,54 +591,6 @@ static inline bool pcpu_region_overlap(int a, int b, int x, int y) return (a < y) && (x < b); } -/** - * pcpu_chunk_update - updates the chunk metadata given a free area - * @chunk: chunk of interest - * @bit_off: chunk offset - * @bits: size of free area - * - * This updates the chunk's contig hint and starting offset given a free area. - * Choose the best starting offset if the contig hint is equal. - */ -static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits) -{ - if (bits > chunk->contig_bits) { - chunk->contig_bits_start = bit_off; - chunk->contig_bits = bits; - } else if (bits == chunk->contig_bits && chunk->contig_bits_start && - (!bit_off || - __ffs(bit_off) > __ffs(chunk->contig_bits_start))) { - /* use the start with the best alignment */ - chunk->contig_bits_start = bit_off; - } -} - -/** - * pcpu_chunk_refresh_hint - updates metadata about a chunk - * @chunk: chunk of interest - * - * Iterates over the metadata blocks to find the largest contig area. - * It also counts the populated pages and uses the delta to update the - * global count. - * - * Updates: - * chunk->contig_bits - * chunk->contig_bits_start - */ -static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) -{ - int bit_off, bits; - - /* clear metadata */ - chunk->contig_bits = 0; - - bit_off = chunk->first_bit; - bits = 0; - pcpu_for_each_md_free_region(chunk, bit_off, bits) { - pcpu_chunk_update(chunk, bit_off, bits); - } -} - /** * pcpu_block_update - updates a block given a free area * @block: block of interest @@ -749,6 +704,29 @@ static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off, pcpu_block_update(block, s_off, e_off); } +/** + * pcpu_chunk_refresh_hint - updates metadata about a chunk + * @chunk: chunk of interest + * + * Iterates over the metadata blocks to find the largest contig area. + * It also counts the populated pages and uses the delta to update the + * global count. + */ +static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) +{ + struct pcpu_block_md *chunk_md = &chunk->chunk_md; + int bit_off, bits; + + /* clear metadata */ + chunk_md->contig_hint = 0; + + bit_off = chunk_md->first_free; + bits = 0; + pcpu_for_each_md_free_region(chunk, bit_off, bits) { + pcpu_block_update(chunk_md, bit_off, bit_off + bits); + } +} + /** * pcpu_block_refresh_hint * @chunk: chunk of interest @@ -796,6 +774,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, int bits) { + struct pcpu_block_md *chunk_md = &chunk->chunk_md; int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ @@ -906,8 +885,9 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, * contig hint is broken. Otherwise, it means a smaller space * was used and therefore the chunk contig hint is still correct. */ - if (pcpu_region_overlap(chunk->contig_bits_start, - chunk->contig_bits_start + chunk->contig_bits, + if (pcpu_region_overlap(chunk_md->contig_hint_start, + chunk_md->contig_hint_start + + chunk_md->contig_hint, bit_off, bit_off + bits)) pcpu_chunk_refresh_hint(chunk); @@ -926,9 +906,10 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, * * A chunk update is triggered if a page becomes free, a block becomes free, * or the free spans across blocks. This tradeoff is to minimize iterating - * over the block metadata to update chunk->contig_bits. chunk->contig_bits - * may be off by up to a page, but it will never be more than the available - * space. If the contig hint is contained in one block, it will be accurate. + * over the block metadata to update chunk_md->contig_hint. + * chunk_md->contig_hint may be off by up to a page, but it will never be more + * than the available space. If the contig hint is contained in one block, it + * will be accurate. */ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, int bits) @@ -1022,8 +1003,9 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index) pcpu_chunk_refresh_hint(chunk); else - pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start), - end - start); + pcpu_block_update(&chunk->chunk_md, + pcpu_block_off_to_off(s_index, start), + end); } /** @@ -1078,6 +1060,7 @@ static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, size_t align, bool pop_only) { + struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, next_off; /* @@ -1086,12 +1069,12 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, * cannot fit in the global hint, there is memory pressure and creating * a new chunk would happen soon. */ - bit_off = ALIGN(chunk->contig_bits_start, align) - - chunk->contig_bits_start; - if (bit_off + alloc_bits > chunk->contig_bits) + bit_off = ALIGN(chunk_md->contig_hint_start, align) - + chunk_md->contig_hint_start; + if (bit_off + alloc_bits > chunk_md->contig_hint) return -1; - bit_off = chunk->first_bit; + bit_off = chunk_md->first_free; bits = 0; pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) { if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, @@ -1186,6 +1169,7 @@ again: static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, size_t align, int start) { + struct pcpu_block_md *chunk_md = &chunk->chunk_md; size_t align_mask = (align) ? (align - 1) : 0; unsigned long area_off = 0, area_bits = 0; int bit_off, end, oslot; @@ -1218,8 +1202,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; /* update first free bit */ - if (bit_off == chunk->first_bit) - chunk->first_bit = find_next_zero_bit( + if (bit_off == chunk_md->first_free) + chunk_md->first_free = find_next_zero_bit( chunk->alloc_map, pcpu_chunk_map_bits(chunk), bit_off + alloc_bits); @@ -1241,6 +1225,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, */ static void pcpu_free_area(struct pcpu_chunk *chunk, int off) { + struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, end, oslot; lockdep_assert_held(&pcpu_lock); @@ -1260,7 +1245,7 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off) chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE; /* update first free bit */ - chunk->first_bit = min(chunk->first_bit, bit_off); + chunk_md->first_free = min(chunk_md->first_free, bit_off); pcpu_block_update_hint_free(chunk, bit_off, bits); @@ -1281,6 +1266,9 @@ static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) { struct pcpu_block_md *md_block; + /* init the chunk's block */ + pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk)); + for (md_block = chunk->md_blocks; md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); md_block++) @@ -1365,7 +1353,6 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->nr_populated = chunk->nr_pages; chunk->nr_empty_pop_pages = chunk->nr_pages; - chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE; chunk->free_bytes = map_size; if (chunk->start_offset) { @@ -1375,7 +1362,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, set_bit(0, chunk->bound_map); set_bit(offset_bits, chunk->bound_map); - chunk->first_bit = offset_bits; + chunk->chunk_md.first_free = offset_bits; pcpu_block_update_hint_alloc(chunk, 0, offset_bits); } @@ -1428,7 +1415,6 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) pcpu_init_md_blocks(chunk); /* init metadata */ - chunk->contig_bits = region_bits; chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; return chunk; -- cgit v1.2.3-55-g7522 From d33d9f3dd96ba290bad3929f82990a0a7467ae90 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Tue, 26 Feb 2019 10:46:48 -0800 Subject: percpu: use chunk scan_hint to skip some scanning Just like blocks, chunks now maintain a scan_hint. This can be used to skip some scanning by promoting the scan_hint to be the contig_hint. The chunk's scan_hint is primarily updated on the backside and relies on full scanning when a block becomes free or the free region spans across blocks. Signed-off-by: Dennis Zhou Reviewed-by: Peng Fan --- mm/percpu.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index daebf7a5343c..30e683f42861 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -707,20 +707,31 @@ static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off, /** * pcpu_chunk_refresh_hint - updates metadata about a chunk * @chunk: chunk of interest + * @full_scan: if we should scan from the beginning * * Iterates over the metadata blocks to find the largest contig area. - * It also counts the populated pages and uses the delta to update the - * global count. + * A full scan can be avoided on the allocation path as this is triggered + * if we broke the contig_hint. In doing so, the scan_hint will be before + * the contig_hint or after if the scan_hint == contig_hint. This cannot + * be prevented on freeing as we want to find the largest area possibly + * spanning blocks. */ -static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) +static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits; - /* clear metadata */ - chunk_md->contig_hint = 0; + /* promote scan_hint to contig_hint */ + if (!full_scan && chunk_md->scan_hint) { + bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint; + chunk_md->contig_hint_start = chunk_md->scan_hint_start; + chunk_md->contig_hint = chunk_md->scan_hint; + chunk_md->scan_hint = 0; + } else { + bit_off = chunk_md->first_free; + chunk_md->contig_hint = 0; + } - bit_off = chunk_md->first_free; bits = 0; pcpu_for_each_md_free_region(chunk, bit_off, bits) { pcpu_block_update(chunk_md, bit_off, bit_off + bits); @@ -880,6 +891,13 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, if (nr_empty_pages) pcpu_update_empty_pages(chunk, -nr_empty_pages); + if (pcpu_region_overlap(chunk_md->scan_hint_start, + chunk_md->scan_hint_start + + chunk_md->scan_hint, + bit_off, + bit_off + bits)) + chunk_md->scan_hint = 0; + /* * The only time a full chunk scan is required is if the chunk * contig hint is broken. Otherwise, it means a smaller space @@ -890,7 +908,7 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, chunk_md->contig_hint, bit_off, bit_off + bits)) - pcpu_chunk_refresh_hint(chunk); + pcpu_chunk_refresh_hint(chunk, false); } /** @@ -1001,7 +1019,7 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, * the else condition below. */ if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index) - pcpu_chunk_refresh_hint(chunk); + pcpu_chunk_refresh_hint(chunk, true); else pcpu_block_update(&chunk->chunk_md, pcpu_block_off_to_off(s_index, start), @@ -1074,7 +1092,7 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, if (bit_off + alloc_bits > chunk_md->contig_hint) return -1; - bit_off = chunk_md->first_free; + bit_off = pcpu_next_hint(chunk_md, alloc_bits); bits = 0; pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) { if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, -- cgit v1.2.3-55-g7522 From 198790d9a3aeaef5792d33a560020861126edc22 Mon Sep 17 00:00:00 2001 From: John Sperbeck Date: Tue, 7 May 2019 18:43:20 -0700 Subject: percpu: remove spurious lock dependency between percpu and sched In free_percpu() we sometimes call pcpu_schedule_balance_work() to queue a work item (which does a wakeup) while holding pcpu_lock. This creates an unnecessary lock dependency between pcpu_lock and the scheduler's pi_lock. There are other places where we call pcpu_schedule_balance_work() without hold pcpu_lock, and this case doesn't need to be different. Moving the call outside the lock prevents the following lockdep splat when running tools/testing/selftests/bpf/{test_maps,test_progs} in sequence with lockdep enabled: ====================================================== WARNING: possible circular locking dependency detected 5.1.0-dbg-DEV #1 Not tainted ------------------------------------------------------ kworker/23:255/18872 is trying to acquire lock: 000000000bc79290 (&(&pool->lock)->rlock){-.-.}, at: __queue_work+0xb2/0x520 but task is already holding lock: 00000000e3e7a6aa (pcpu_lock){..-.}, at: free_percpu+0x36/0x260 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #4 (pcpu_lock){..-.}: lock_acquire+0x9e/0x180 _raw_spin_lock_irqsave+0x3a/0x50 pcpu_alloc+0xfa/0x780 __alloc_percpu_gfp+0x12/0x20 alloc_htab_elem+0x184/0x2b0 __htab_percpu_map_update_elem+0x252/0x290 bpf_percpu_hash_update+0x7c/0x130 __do_sys_bpf+0x1912/0x1be0 __x64_sys_bpf+0x1a/0x20 do_syscall_64+0x59/0x400 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #3 (&htab->buckets[i].lock){....}: lock_acquire+0x9e/0x180 _raw_spin_lock_irqsave+0x3a/0x50 htab_map_update_elem+0x1af/0x3a0 -> #2 (&rq->lock){-.-.}: lock_acquire+0x9e/0x180 _raw_spin_lock+0x2f/0x40 task_fork_fair+0x37/0x160 sched_fork+0x211/0x310 copy_process.part.43+0x7b1/0x2160 _do_fork+0xda/0x6b0 kernel_thread+0x29/0x30 rest_init+0x22/0x260 arch_call_rest_init+0xe/0x10 start_kernel+0x4fd/0x520 x86_64_start_reservations+0x24/0x26 x86_64_start_kernel+0x6f/0x72 secondary_startup_64+0xa4/0xb0 -> #1 (&p->pi_lock){-.-.}: lock_acquire+0x9e/0x180 _raw_spin_lock_irqsave+0x3a/0x50 try_to_wake_up+0x41/0x600 wake_up_process+0x15/0x20 create_worker+0x16b/0x1e0 workqueue_init+0x279/0x2ee kernel_init_freeable+0xf7/0x288 kernel_init+0xf/0x180 ret_from_fork+0x24/0x30 -> #0 (&(&pool->lock)->rlock){-.-.}: __lock_acquire+0x101f/0x12a0 lock_acquire+0x9e/0x180 _raw_spin_lock+0x2f/0x40 __queue_work+0xb2/0x520 queue_work_on+0x38/0x80 free_percpu+0x221/0x260 pcpu_freelist_destroy+0x11/0x20 stack_map_free+0x2a/0x40 bpf_map_free_deferred+0x3c/0x50 process_one_work+0x1f7/0x580 worker_thread+0x54/0x410 kthread+0x10f/0x150 ret_from_fork+0x24/0x30 other info that might help us debug this: Chain exists of: &(&pool->lock)->rlock --> &htab->buckets[i].lock --> pcpu_lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(pcpu_lock); lock(&htab->buckets[i].lock); lock(pcpu_lock); lock(&(&pool->lock)->rlock); *** DEADLOCK *** 3 locks held by kworker/23:255/18872: #0: 00000000b36a6e16 ((wq_completion)events){+.+.}, at: process_one_work+0x17a/0x580 #1: 00000000dfd966f0 ((work_completion)(&map->work)){+.+.}, at: process_one_work+0x17a/0x580 #2: 00000000e3e7a6aa (pcpu_lock){..-.}, at: free_percpu+0x36/0x260 stack backtrace: CPU: 23 PID: 18872 Comm: kworker/23:255 Not tainted 5.1.0-dbg-DEV #1 Hardware name: ... Workqueue: events bpf_map_free_deferred Call Trace: dump_stack+0x67/0x95 print_circular_bug.isra.38+0x1c6/0x220 check_prev_add.constprop.50+0x9f6/0xd20 __lock_acquire+0x101f/0x12a0 lock_acquire+0x9e/0x180 _raw_spin_lock+0x2f/0x40 __queue_work+0xb2/0x520 queue_work_on+0x38/0x80 free_percpu+0x221/0x260 pcpu_freelist_destroy+0x11/0x20 stack_map_free+0x2a/0x40 bpf_map_free_deferred+0x3c/0x50 process_one_work+0x1f7/0x580 worker_thread+0x54/0x410 kthread+0x10f/0x150 ret_from_fork+0x24/0x30 Signed-off-by: John Sperbeck Signed-off-by: Dennis Zhou --- mm/percpu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index 30e683f42861..7d038393d8f5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1959,6 +1959,7 @@ void free_percpu(void __percpu *ptr) struct pcpu_chunk *chunk; unsigned long flags; int off; + bool need_balance = false; if (!ptr) return; @@ -1980,7 +1981,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - pcpu_schedule_balance_work(); + need_balance = true; break; } } @@ -1988,6 +1989,9 @@ void free_percpu(void __percpu *ptr) trace_percpu_free_percpu(chunk->base_addr, off, ptr); spin_unlock_irqrestore(&pcpu_lock, flags); + + if (need_balance) + pcpu_schedule_balance_work(); } EXPORT_SYMBOL_GPL(free_percpu); -- cgit v1.2.3-55-g7522