From 148deab223b23734069abcacb5c7118b0e7deadc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:08:49 -0800 Subject: radix-tree: improve multiorder iterators This fixes several interlinked problems with the iterators in the presence of multiorder entries. 1. radix_tree_iter_next() would only advance by one slot, which would result in the iterators returning the same entry more than once if there were sibling entries. 2. radix_tree_next_slot() could return an internal pointer instead of a user pointer if a tagged multiorder entry was immediately followed by an entry of lower order. 3. radix_tree_next_slot() expanded to a lot more code than it used to when multiorder support was compiled in. And I wasn't comfortable with entry_to_node() being in a header file. Fixing radix_tree_iter_next() for the presence of sibling entries necessarily involves examining the contents of the radix tree, so we now need to pass 'slot' to radix_tree_iter_next(), and we need to change the calling convention so it is called *before* dropping the lock which protects the tree. Also rename it to radix_tree_iter_resume(), as some people thought it was necessary to call radix_tree_iter_next() each time around the loop. radix_tree_next_slot() becomes closer to how it looked before multiorder support was introduced. It only checks to see if the next entry in the chunk is a sibling entry or a pointer to a node; this should be rare enough that handling this case out of line is not a performance impact (and such impact is amortised by the fact that the entry we just processed was a multiorder entry). Also, radix_tree_next_slot() used to force a new chunk lookup for untagged entries, which is more expensive than the out of line sibling entry skipping. Link: http://lkml.kernel.org/r/1480369871-5271-55-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Tested-by: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 121 insertions(+), 17 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/lib/radix-tree.c b/lib/radix-tree.c index dbf4e8c8e100..0a7ac0fb4a65 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -69,6 +69,11 @@ struct radix_tree_preload { }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +static inline struct radix_tree_node *entry_to_node(void *ptr) +{ + return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); +} + static inline void *node_to_entry(void *ptr) { return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); @@ -1104,6 +1109,120 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter, #endif } +/* Construct iter->tags bit-mask from node->tags[tag] array */ +static void set_iter_tags(struct radix_tree_iter *iter, + struct radix_tree_node *node, unsigned offset, + unsigned tag) +{ + unsigned tag_long = offset / BITS_PER_LONG; + unsigned tag_bit = offset % BITS_PER_LONG; + + iter->tags = node->tags[tag][tag_long] >> tag_bit; + + /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ + if (tag_long < RADIX_TREE_TAG_LONGS - 1) { + /* Pick tags from next element */ + if (tag_bit) + iter->tags |= node->tags[tag][tag_long + 1] << + (BITS_PER_LONG - tag_bit); + /* Clip chunk size, here only BITS_PER_LONG tags */ + iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG); + } +} + +#ifdef CONFIG_RADIX_TREE_MULTIORDER +static void **skip_siblings(struct radix_tree_node **nodep, + void **slot, struct radix_tree_iter *iter) +{ + void *sib = node_to_entry(slot - 1); + + while (iter->index < iter->next_index) { + *nodep = rcu_dereference_raw(*slot); + if (*nodep && *nodep != sib) + return slot; + slot++; + iter->index = __radix_tree_iter_add(iter, 1); + iter->tags >>= 1; + } + + *nodep = NULL; + return NULL; +} + +void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, + unsigned flags) +{ + unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; + struct radix_tree_node *node = rcu_dereference_raw(*slot); + + slot = skip_siblings(&node, slot, iter); + + while (radix_tree_is_internal_node(node)) { + unsigned offset; + unsigned long next_index; + + if (node == RADIX_TREE_RETRY) + return slot; + node = entry_to_node(node); + iter->shift = node->shift; + + if (flags & RADIX_TREE_ITER_TAGGED) { + offset = radix_tree_find_next_bit(node, tag, 0); + if (offset == RADIX_TREE_MAP_SIZE) + return NULL; + slot = &node->slots[offset]; + iter->index = __radix_tree_iter_add(iter, offset); + set_iter_tags(iter, node, offset, tag); + node = rcu_dereference_raw(*slot); + } else { + offset = 0; + slot = &node->slots[0]; + for (;;) { + node = rcu_dereference_raw(*slot); + if (node) + break; + slot++; + offset++; + if (offset == RADIX_TREE_MAP_SIZE) + return NULL; + } + iter->index = __radix_tree_iter_add(iter, offset); + } + if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0)) + goto none; + next_index = (iter->index | shift_maxindex(iter->shift)) + 1; + if (next_index < iter->next_index) + iter->next_index = next_index; + } + + return slot; + none: + iter->next_index = 0; + return NULL; +} +EXPORT_SYMBOL(__radix_tree_next_slot); +#else +static void **skip_siblings(struct radix_tree_node **nodep, + void **slot, struct radix_tree_iter *iter) +{ + return slot; +} +#endif + +void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter) +{ + struct radix_tree_node *node; + + slot++; + iter->index = __radix_tree_iter_add(iter, 1); + node = rcu_dereference_raw(*slot); + skip_siblings(&node, slot, iter); + iter->next_index = iter->index; + iter->tags = 0; + return NULL; +} +EXPORT_SYMBOL(radix_tree_iter_resume); + /** * radix_tree_next_chunk - find next chunk of slots for iteration * @@ -1191,23 +1310,8 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, iter->next_index = (index | node_maxindex(node)) + 1; __set_iter_shift(iter, node->shift); - /* Construct iter->tags bit-mask from node->tags[tag] array */ - if (flags & RADIX_TREE_ITER_TAGGED) { - unsigned tag_long, tag_bit; - - tag_long = offset / BITS_PER_LONG; - tag_bit = offset % BITS_PER_LONG; - iter->tags = node->tags[tag][tag_long] >> tag_bit; - /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ - if (tag_long < RADIX_TREE_TAG_LONGS - 1) { - /* Pick tags from next element */ - if (tag_bit) - iter->tags |= node->tags[tag][tag_long + 1] << - (BITS_PER_LONG - tag_bit); - /* Clip chunk size, here only BITS_PER_LONG tags */ - iter->next_index = index + BITS_PER_LONG; - } - } + if (flags & RADIX_TREE_ITER_TAGGED) + set_iter_tags(iter, node, offset, tag); return node->slots + offset; } -- cgit v1.2.3-55-g7522