summaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/async-thread.c848
-rw-r--r--fs/btrfs/async-thread.h121
-rw-r--r--fs/btrfs/backref.c84
-rw-r--r--fs/btrfs/btrfs_inode.h14
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/ctree.c11
-rw-r--r--fs/btrfs/ctree.h73
-rw-r--r--fs/btrfs/delayed-inode.c6
-rw-r--r--fs/btrfs/delayed-ref.c29
-rw-r--r--fs/btrfs/dev-replace.c79
-rw-r--r--fs/btrfs/disk-io.c282
-rw-r--r--fs/btrfs/extent-tree.c58
-rw-r--r--fs/btrfs/extent_io.c15
-rw-r--r--fs/btrfs/extent_map.c56
-rw-r--r--fs/btrfs/extent_map.h10
-rw-r--r--fs/btrfs/file.c160
-rw-r--r--fs/btrfs/inode.c125
-rw-r--r--fs/btrfs/ioctl.c226
-rw-r--r--fs/btrfs/ordered-data.c68
-rw-r--r--fs/btrfs/ordered-data.h6
-rw-r--r--fs/btrfs/qgroup.c15
-rw-r--r--fs/btrfs/raid56.c21
-rw-r--r--fs/btrfs/reada.c4
-rw-r--r--fs/btrfs/relocation.c2
-rw-r--r--fs/btrfs/root-tree.c3
-rw-r--r--fs/btrfs/scrub.c97
-rw-r--r--fs/btrfs/send.c831
-rw-r--r--fs/btrfs/super.c48
-rw-r--r--fs/btrfs/sysfs.c43
-rw-r--r--fs/btrfs/sysfs.h5
-rw-r--r--fs/btrfs/transaction.c39
-rw-r--r--fs/btrfs/tree-log.c236
-rw-r--r--fs/btrfs/tree-log.h18
-rw-r--r--fs/btrfs/volumes.c46
-rw-r--r--fs/btrfs/volumes.h1
35 files changed, 2031 insertions, 1651 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c1e0b0caf9cc..ecb5832c0967 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -21,708 +22,313 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
+#include <linux/workqueue.h>
#include "async-thread.h"
+#include "ctree.h"
+
+#define WORK_DONE_BIT 0
+#define WORK_ORDER_DONE_BIT 1
+#define WORK_HIGH_PRIO_BIT 2
+
+#define NO_THRESHOLD (-1)
+#define DFT_THRESHOLD (32)
+
+struct __btrfs_workqueue {
+ struct workqueue_struct *normal_wq;
+ /* List head pointing to ordered work list */
+ struct list_head ordered_list;
+
+ /* Spinlock for ordered_list */
+ spinlock_t list_lock;
+
+ /* Thresholding related variants */
+ atomic_t pending;
+ int max_active;
+ int current_max;
+ int thresh;
+ unsigned int count;
+ spinlock_t thres_lock;
+};
-#define WORK_QUEUED_BIT 0
-#define WORK_DONE_BIT 1
-#define WORK_ORDER_DONE_BIT 2
-#define WORK_HIGH_PRIO_BIT 3
-
-/*
- * container for the kthread task pointer and the list of pending work
- * One of these is allocated per thread.
- */
-struct btrfs_worker_thread {
- /* pool we belong to */
- struct btrfs_workers *workers;
-
- /* list of struct btrfs_work that are waiting for service */
- struct list_head pending;
- struct list_head prio_pending;
-
- /* list of worker threads from struct btrfs_workers */
- struct list_head worker_list;
-
- /* kthread */
- struct task_struct *task;
+struct btrfs_workqueue {
+ struct __btrfs_workqueue *normal;
+ struct __btrfs_workqueue *high;
+};
- /* number of things on the pending list */
- atomic_t num_pending;
+static inline struct __btrfs_workqueue
+*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+ int thresh)
+{
+ struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
- /* reference counter for this struct */
- atomic_t refs;
+ if (unlikely(!ret))
+ return NULL;
- unsigned long sequence;
+ ret->max_active = max_active;
+ atomic_set(&ret->pending, 0);
+ if (thresh == 0)
+ thresh = DFT_THRESHOLD;
+ /* For low threshold, disabling threshold is a better choice */
+ if (thresh < DFT_THRESHOLD) {
+ ret->current_max = max_active;
+ ret->thresh = NO_THRESHOLD;
+ } else {
+ ret->current_max = 1;
+ ret->thresh = thresh;
+ }
- /* protects the pending list. */
- spinlock_t lock;
+ if (flags & WQ_HIGHPRI)
+ ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
+ ret->max_active,
+ "btrfs", name);
+ else
+ ret->normal_wq = alloc_workqueue("%s-%s", flags,
+ ret->max_active, "btrfs",
+ name);
+ if (unlikely(!ret->normal_wq)) {
+ kfree(ret);
+ return NULL;
+ }
- /* set to non-zero when this thread is already awake and kicking */
- int working;
+ INIT_LIST_HEAD(&ret->ordered_list);
+ spin_lock_init(&ret->list_lock);
+ spin_lock_init(&ret->thres_lock);
+ trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
+ return ret;
+}
- /* are we currently idle */
- int idle;
-};
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
-static int __btrfs_start_workers(struct btrfs_workers *workers);
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+ int flags,
+ int max_active,
+ int thresh)
+{
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
-/*
- * btrfs_start_workers uses kthread_run, which can block waiting for memory
- * for a very long time. It will actually throttle on page writeback,
- * and so it may not make progress until after our btrfs worker threads
- * process all of the pending work structs in their queue
- *
- * This means we can't use btrfs_start_workers from inside a btrfs worker
- * thread that is used as part of cleaning dirty memory, which pretty much
- * involves all of the worker threads.
- *
- * Instead we have a helper queue who never has more than one thread
- * where we scheduler thread start operations. This worker_start struct
- * is used to contain the work and hold a pointer to the queue that needs
- * another worker.
- */
-struct worker_start {
- struct btrfs_work work;
- struct btrfs_workers *queue;
-};
+ if (unlikely(!ret))
+ return NULL;
-static void start_new_worker_func(struct btrfs_work *work)
-{
- struct worker_start *start;
- start = container_of(work, struct worker_start, work);
- __btrfs_start_workers(start->queue);
- kfree(start);
-}
+ ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
+ max_active, thresh);
+ if (unlikely(!ret->normal)) {
+ kfree(ret);
+ return NULL;
+ }
-/*
- * helper function to move a thread onto the idle list after it
- * has finished some requests.
- */
-static void check_idle_worker(struct btrfs_worker_thread *worker)
-{
- if (!worker->idle && atomic_read(&worker->num_pending) <
- worker->workers->idle_thresh / 2) {
- unsigned long flags;
- spin_lock_irqsave(&worker->workers->lock, flags);
- worker->idle = 1;
-
- /* the list may be empty if the worker is just starting */
- if (!list_empty(&worker->worker_list) &&
- !worker->workers->stopping) {
- list_move(&worker->worker_list,
- &worker->workers->idle_list);
+ if (flags & WQ_HIGHPRI) {
+ ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
+ thresh);
+ if (unlikely(!ret->high)) {
+ __btrfs_destroy_workqueue(ret->normal);
+ kfree(ret);
+ return NULL;
}
- spin_unlock_irqrestore(&worker->workers->lock, flags);
}
+ return ret;
}
/*
- * helper function to move a thread off the idle list after new
- * pending work is added.
+ * Hook for threshold which will be called in btrfs_queue_work.
+ * This hook WILL be called in IRQ handler context,
+ * so workqueue_set_max_active MUST NOT be called in this hook
*/
-static void check_busy_worker(struct btrfs_worker_thread *worker)
+static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
{
- if (worker->idle && atomic_read(&worker->num_pending) >=
- worker->workers->idle_thresh) {
- unsigned long flags;
- spin_lock_irqsave(&worker->workers->lock, flags);
- worker->idle = 0;
-
- if (!list_empty(&worker->worker_list) &&
- !worker->workers->stopping) {
- list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
- }
- spin_unlock_irqrestore(&worker->workers->lock, flags);
- }
+ if (wq->thresh == NO_THRESHOLD)
+ return;
+ atomic_inc(&wq->pending);
}
-static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
+/*
+ * Hook for threshold which will be called before executing the work,
+ * This hook is called in kthread content.
+ * So workqueue_set_max_active is called here.
+ */
+static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
{
- struct btrfs_workers *workers = worker->workers;
- struct worker_start *start;
- unsigned long flags;
+ int new_max_active;
+ long pending;
+ int need_change = 0;
- rmb();
- if (!workers->atomic_start_pending)
+ if (wq->thresh == NO_THRESHOLD)
return;
- start = kzalloc(sizeof(*start), GFP_NOFS);
- if (!start)
- return;
-
- start->work.func = start_new_worker_func;
- start->queue = workers;
-
- spin_lock_irqsave(&workers->lock, flags);
- if (!workers->atomic_start_pending)
- goto out;
-
- workers->atomic_start_pending = 0;
- if (workers->num_workers + workers->num_workers_starting >=
- workers->max_workers)
- goto out;
-
- workers->num_workers_starting += 1;
- spin_unlock_irqrestore(&workers->lock, flags);
- btrfs_queue_worker(workers->atomic_worker_start, &start->work);
- return;
+ atomic_dec(&wq->pending);
+ spin_lock(&wq->thres_lock);
+ /*
+ * Use wq->count to limit the calling frequency of
+ * workqueue_set_max_active.
+ */
+ wq->count++;
+ wq->count %= (wq->thresh / 4);
+ if (!wq->count)
+ goto out;
+ new_max_active = wq->current_max;
+ /*
+ * pending may be changed later, but it's OK since we really
+ * don't need it so accurate to calculate new_max_active.
+ */
+ pending = atomic_read(&wq->pending);
+ if (pending > wq->thresh)
+ new_max_active++;
+ if (pending < wq->thresh / 2)
+ new_max_active--;
+ new_max_active = clamp_val(new_max_active, 1, wq->max_active);
+ if (new_max_active != wq->current_max) {
+ need_change = 1;
+ wq->current_max = new_max_active;
+ }
out:
- kfree(start);
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&wq->thres_lock);
+
+ if (need_change) {
+ workqueue_set_max_active(wq->normal_wq, wq->current_max);
+ }
}
-static noinline void run_ordered_completions(struct btrfs_workers *workers,
- struct btrfs_work *work)
+static void run_ordered_work(struct __btrfs_workqueue *wq)
{
- if (!workers->ordered)
- return;
-
- set_bit(WORK_DONE_BIT, &work->flags);
-
- spin_lock(&workers->order_lock);
+ struct list_head *list = &wq->ordered_list;
+ struct btrfs_work *work;
+ spinlock_t *lock = &wq->list_lock;
+ unsigned long flags;
while (1) {
- if (!list_empty(&workers->prio_order_list)) {
- work = list_entry(workers->prio_order_list.next,
- struct btrfs_work, order_list);
- } else if (!list_empty(&workers->order_list)) {
- work = list_entry(workers->order_list.next,
- struct btrfs_work, order_list);
- } else {
+ spin_lock_irqsave(lock, flags);
+ if (list_empty(list))
break;
- }
+ work = list_entry(list->next, struct btrfs_work,
+ ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
- /* we are going to call the ordered done function, but
+ /*
+ * we are going to call the ordered done function, but
* we leave the work item on the list as a barrier so
* that later work items that are done don't have their
* functions called before this one returns
*/
if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
break;
-
- spin_unlock(&workers->order_lock);
-
+ trace_btrfs_ordered_sched(work);
+ spin_unlock_irqrestore(lock, flags);
work->ordered_func(work);
/* now take the lock again and drop our item from the list */
- spin_lock(&workers->order_lock);
- list_del(&work->order_list);
- spin_unlock(&workers->order_lock);
+ spin_lock_irqsave(lock, flags);
+ list_del(&work->ordered_list);
+ spin_unlock_irqrestore(lock, flags);
/*
* we don't want to call the ordered free functions
* with the lock held though
*/
work->ordered_free(work);
- spin_lock(&workers->order_lock);
- }
-
- spin_unlock(&workers->order_lock);
-}
-
-static void put_worker(struct btrfs_worker_thread *worker)
-{
- if (atomic_dec_and_test(&worker->refs))
- kfree(worker);
-}
-
-static int try_worker_shutdown(struct btrfs_worker_thread *worker)
-{
- int freeit = 0;
-
- spin_lock_irq(&worker->lock);
- spin_lock(&worker->workers->lock);
- if (worker->workers->num_workers > 1 &&
- worker->idle &&
- !worker->working &&
- !list_empty(&worker->worker_list) &&
- list_empty(&worker->prio_pending) &&
- list_empty(&worker->pending) &&
- atomic_read(&worker->num_pending) == 0) {
- freeit = 1;
- list_del_init(&worker->worker_list);
- worker->workers->num_workers--;
+ trace_btrfs_all_work_done(work);
}
- spin_unlock(&worker->workers->lock);
- spin_unlock_irq(&worker->lock);
-
- if (freeit)
- put_worker(worker);
- return freeit;
+ spin_unlock_irqrestore(lock, flags);
}
-static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
- struct list_head *prio_head,
- struct list_head *head)
-{
- struct btrfs_work *work = NULL;
- struct list_head *cur = NULL;
-
- if (!list_empty(prio_head))
- cur = prio_head->next;
-
- smp_mb();
- if (!list_empty(&worker->prio_pending))
- goto refill;
-
- if (!list_empty(head))
- cur = head->next;
-
- if (cur)
- goto out;
-
-refill:
- spin_lock_irq(&worker->lock);
- list_splice_tail_init(&worker->prio_pending, prio_head);
- list_splice_tail_init(&worker->pending, head);
-
- if (!list_empty(prio_head))
- cur = prio_head->next;
- else if (!list_empty(head))
- cur = head->next;
- spin_unlock_irq(&worker->lock);
-
- if (!cur)
- goto out_fail;
-
-out:
- work = list_entry(cur, struct btrfs_work, list);
-
-out_fail:
- return work;
-}
-
-/*
- * main loop for servicing work items
- */
-static int worker_loop(void *arg)
+static void normal_work_helper(struct work_struct *arg)
{
- struct btrfs_worker_thread *worker = arg;
- struct list_head head;
- struct list_head prio_head;
struct btrfs_work *work;
+ struct __btrfs_workqueue *wq;
+ int need_order = 0;
- INIT_LIST_HEAD(&head);
- INIT_LIST_HEAD(&prio_head);
-
- do {
-again:
- while (1) {
-
-
- work = get_next_work(worker, &prio_head, &head);
- if (!work)
- break;
-
- list_del(&work->list);
- clear_bit(WORK_QUEUED_BIT, &work->flags);
-
- work->worker = worker;
-
- work->func(work);
-
- atomic_dec(&worker->num_pending);
- /*
- * unless this is an ordered work queue,
- * 'work' was probably freed by func above.
- */
- run_ordered_completions(worker->workers, work);
-
- check_pending_worker_creates(worker);
- cond_resched();
- }
-
- spin_lock_irq(&worker->lock);
- check_idle_worker(worker);
-
- if (freezing(current)) {
- worker->working = 0;
- spin_unlock_irq(&worker->lock);
- try_to_freeze();
- } else {
- spin_unlock_irq(&worker->lock);
- if (!kthread_should_stop()) {
- cpu_relax();
- /*
- * we've dropped the lock, did someone else
- * jump_in?
- */
- smp_mb();
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending))
- continue;
-
- /*
- * this short schedule allows more work to
- * come in without the queue functions
- * needing to go through wake_up_process()
- *
- * worker->working is still 1, so nobody
- * is going to try and wake us up
- */
- schedule_timeout(1);
- smp_mb();
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending))
- continue;
-
- if (kthread_should_stop())
- break;
-
- /* still no more work?, sleep for real */
- spin_lock_irq(&worker->lock);
- set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending)) {
- spin_unlock_irq(&worker->lock);
- set_current_state(TASK_RUNNING);
- goto again;
- }
-
- /*
- * this makes sure we get a wakeup when someone
- * adds something new to the queue
- */
- worker->working = 0;
- spin_unlock_irq(&worker->lock);
-
- if (!kthread_should_stop()) {
- schedule_timeout(HZ * 120);
- if (!worker->working &&
- try_worker_shutdown(worker)) {
- return 0;
- }
- }
- }
- __set_current_state(TASK_RUNNING);
- }
- } while (!kthread_should_stop());
- return 0;
-}
-
-/*
- * this will wait for all the worker threads to shutdown
- */
-void btrfs_stop_workers(struct btrfs_workers *workers)
-{
- struct list_head *cur;
- struct btrfs_worker_thread *worker;
- int can_stop;
-
- spin_lock_irq(&workers->lock);
- workers->stopping = 1;
- list_splice_init(&workers->idle_list, &workers->worker_list);
- while (!list_empty(&workers->worker_list)) {
- cur = workers->worker_list.next;
- worker = list_entry(cur, struct btrfs_worker_thread,
- worker_list);
-
- atomic_inc(&worker->refs);
- workers->num_workers -= 1;
- if (!list_empty(&worker->worker_list)) {
- list_del_init(&worker->worker_list);
- put_worker(worker);
- can_stop = 1;
- } else
- can_stop = 0;
- spin_unlock_irq(&workers->lock);
- if (can_stop)
- kthread_stop(worker->task);
- spin_lock_irq(&workers->lock);
- put_worker(worker);
+ work = container_of(arg, struct btrfs_work, normal_work);
+ /*
+ * We should not touch things inside work in the following cases:
+ * 1) after work->func() if it has no ordered_free
+ * Since the struct is freed in work->func().
+ * 2) after setting WORK_DONE_BIT
+ * The work may be freed in other threads almost instantly.
+ * So we save the needed things here.
+ */
+ if (work->ordered_func)
+ need_order = 1;
+ wq = work->wq;
+
+ trace_btrfs_work_sched(work);
+ thresh_exec_hook(wq);
+ work->func(work);
+ if (need_order) {
+ set_bit(WORK_DONE_BIT, &work->flags);
+ run_ordered_work(wq);
}
- spin_unlock_irq(&workers->lock);
+ if (!need_order)
+ trace_btrfs_all_work_done(work);
}
-/*
- * simple init on struct btrfs_workers
- */
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
- struct btrfs_workers *async_helper)
+void btrfs_init_work(struct btrfs_work *work,
+ btrfs_func_t func,
+ btrfs_func_t ordered_func,
+ btrfs_func_t ordered_free)
{
- workers->num_workers = 0;
- workers->num_workers_starting = 0;
- INIT_LIST_HEAD(&workers->worker_list);
- INIT_LIST_HEAD(&workers->idle_list);
- INIT_LIST_HEAD(&workers->order_list);
- INIT_LIST_HEAD(&workers->prio_order_list);
- spin_lock_init(&workers->lock);
- spin_lock_init(&workers->order_lock);
- workers->max_workers = max;
- workers->idle_thresh = 32;
- workers->name = name;
- workers->ordered = 0;
- workers->atomic_start_pending = 0;
- workers->atomic_worker_start = async_helper;
- workers->stopping = 0;
+ work->func = func;
+ work->ordered_func = ordered_func;
+ work->ordered_free = ordered_free;
+ INIT_WORK(&work->normal_work, normal_work_helper);
+ INIT_LIST_HEAD(&work->ordered_list);
+ work->flags = 0;
}
-/*
- * starts new worker threads. This does not enforce the max worker
- * count in case you need to temporarily go past it.
- */
-static int __btrfs_start_workers(struct btrfs_workers *workers)
+static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
+ struct btrfs_work *work)
{
- struct btrfs_worker_thread *worker;
- int ret = 0;
-
- worker = kzalloc(sizeof(*worker), GFP_NOFS);
- if (!worker) {
- ret = -ENOMEM;
- goto fail;
- }
-
- INIT_LIST_HEAD(&worker->pending);
- INIT_LIST_HEAD(&worker->prio_pending);
- INIT_LIST_HEAD(&worker->worker_list);
- spin_lock_init(&worker->lock);
-
- atomic_set(&worker->num_pending, 0);
- atomic_set(&worker->refs, 1);
- worker->workers = workers;
- worker->task = kthread_create(worker_loop, worker,
- "btrfs-%s-%d", workers->name,
- workers->num_workers + 1);
- if (IS_ERR(worker->task)) {
- ret = PTR_ERR(worker->task);
- goto fail;
- }
+ unsigned long flags;
- spin_lock_irq(&workers->lock);
- if (workers->stopping) {
- spin_unlock_irq(&workers->lock);
- ret = -EINVAL;
- goto fail_kthread;
+ work->wq = wq;
+ thresh_queue_hook(wq);
+ if (work->ordered_func) {
+ spin_lock_irqsave(&wq->list_lock, flags);
+ list_add_tail(&work->ordered_list, &wq->ordered_list);
+ spin_unlock_irqrestore(&wq->list_lock, flags);
}
- list_add_tail(&worker->worker_list, &workers->idle_list);
- worker->idle = 1;
- workers->num_workers++;
- workers->num_workers_starting--;
- WARN_ON(workers->num_workers_starting < 0);
- spin_unlock_irq(&workers->lock);
-
- wake_up_process(worker->task);
- return 0;
-
-fail_kthread:
- kthread_stop(worker->task);
-fail:
- kfree(worker);
- spin_lock_irq(&workers->lock);
- workers->num_workers_starting--;
- spin_unlock_irq(&workers->lock);
- return ret;
+ queue_work(wq->normal_wq, &work->normal_work);
+ trace_btrfs_work_queued(work);
}
-int btrfs_start_workers(struct btrfs_workers *workers)
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *work)
{
- spin_lock_irq(&workers->lock);
- workers->num_workers_starting++;
- spin_unlock_irq(&workers->lock);
- return __btrfs_start_workers(workers);
-}
-
-/*
- * run through the list and find a worker thread that doesn't have a lot
- * to do right now. This can return null if we aren't yet at the thread
- * count limit and all of the threads are busy.
- */
-static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
-{
- struct btrfs_worker_thread *worker;
- struct list_head *next;
- int enforce_min;
-
- enforce_min = (workers->num_workers + workers->num_workers_starting) <
- workers->max_workers;
-
- /*
- * if we find an idle thread, don't move it to the end of the
- * idle list. This improves the chance that the next submission
- * will reuse the same thread, and maybe catch it while it is still
- * working
- */
- if (!list_empty(&workers->idle_list)) {
- next = workers->idle_list.next;
- worker = list_entry(next, struct btrfs_worker_thread,
- worker_list);
- return worker;
- }
- if (enforce_min || list_empty(&workers->worker_list))
- return NULL;
-
- /*
- * if we pick a busy task, move the task to the end of the list.
- * hopefully this will keep things somewhat evenly balanced.
- * Do the move in batches based on the sequence number. This groups
- * requests submitted at roughly the same time onto the same worker.
- */
- next = workers->worker_list.next;
- worker = list_entry(next, struct btrfs_worker_thread, worker_list);
- worker->sequence++;
+ struct __btrfs_workqueue *dest_wq;
- if (worker->sequence % workers->idle_thresh == 0)
- list_move_tail(next, &workers->worker_list);
- return worker;
+ if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
+ dest_wq = wq->high;
+ else
+ dest_wq = wq->normal;
+ __btrfs_queue_work(dest_wq, work);
}
-/*
- * selects a worker thread to take the next job. This will either find
- * an idle worker, start a new worker up to the max count, or just return
- * one of the existing busy workers.
- */
-static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
{
- struct btrfs_worker_thread *worker;
- unsigned long flags;
- struct list_head *fallback;
- int ret;
-
- spin_lock_irqsave(&workers->lock, flags);
-again:
- worker = next_worker(workers);
-
- if (!worker) {
- if (workers->num_workers + workers->num_workers_starting >=
- workers->max_workers) {
- goto fallback;
- } else if (workers->atomic_worker_start) {
- workers->atomic_start_pending = 1;
- goto fallback;
- } else {
- workers->num_workers_starting++;
- spin_unlock_irqrestore(&workers->lock, flags);
- /* we're below the limit, start another worker */
- ret = __btrfs_start_workers(workers);
- spin_lock_irqsave(&workers->lock, flags);
- if (ret)
- goto fallback;
- goto again;
- }
- }
- goto found;
-
-fallback:
- fallback = NULL;
- /*
- * we have failed to find any workers, just
- * return the first one we can find.
- */
- if (!list_empty(&workers->worker_list))
- fallback = workers->worker_list.next;
- if (!list_empty(&workers->idle_list))
- fallback = workers->idle_list.next;
- BUG_ON(!fallback);
- worker = list_entry(fallback,
- struct btrfs_worker_thread, worker_list);
-found:
- /*
- * this makes sure the worker doesn't exit before it is placed
- * onto a busy/idle list
- */
- atomic_inc(&worker->num_pending);
- spin_unlock_irqrestore(&workers->lock, flags);
- return worker;
+ destroy_workqueue(wq->normal_wq);
+ trace_btrfs_workqueue_destroy(wq);
+ kfree(wq);
}
-/*
- * btrfs_requeue_work just puts the work item back on the tail of the list
- * it was taken from. It is intended for use with long running work functions
- * that make some progress and want to give the cpu up for others.
- */
-void btrfs_requeue_work(struct btrfs_work *work)
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
- struct btrfs_worker_thread *worker = work->worker;
- unsigned long flags;
- int wake = 0;
-
- if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
+ if (!wq)
return;
-
- spin_lock_irqsave(&worker->lock, flags);
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
- list_add_tail(&work->list, &worker->prio_pending);
- else
- list_add_tail(&work->list, &worker->pending);
- atomic_inc(&worker->num_pending);
-
- /* by definition we're busy, take ourselves off the idle
- * list
- */
- if (worker->idle) {
- spin_lock(&worker->workers->lock);
- worker->idle = 0;
- list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
- spin_unlock(&worker->workers->lock);
- }
- if (!worker->working) {
- wake = 1;
- worker->working = 1;
- }
-
- if (wake)
- wake_up_process(worker->task);
- spin_unlock_irqrestore(&worker->lock, flags);
+ if (wq->high)
+ __btrfs_destroy_workqueue(wq->high);
+ __btrfs_destroy_workqueue(wq->normal);
+ kfree(wq);
}
-void btrfs_set_work_high_prio(struct btrfs_work *work)
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
{
- set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+ wq->normal->max_active = max;
+ if (wq->high)
+ wq->high->max_active = max;
}
-/*
- * places a struct btrfs_work into the pending queue of one of the kthreads
- */
-void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+void btrfs_set_work_high_priority(struct btrfs_work *work)
{
- struct btrfs_worker_thread *worker;
- unsigned long flags;
- int wake = 0;
-
- /* don't requeue something already on a list */
- if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
- return;
-
- worker = find_worker(workers);
- if (workers->ordered) {
- /*
- * you're not allowed to do ordered queues from an
- * interrupt handler
- */
- spin_lock(&workers->order_lock);
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
- list_add_tail(&work->order_list,
- &workers->prio_order_list);
- } else {
- list_add_tail(&work->order_list, &workers->order_list);
- }
- spin_unlock(&workers->order_lock);
- } else {
- INIT_LIST_HEAD(&work->order_list);
- }
-
- spin_lock_irqsave(&worker->lock, flags);
-
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
- list_add_tail(&work->list, &worker->prio_pending);
- else
- list_add_tail(&work->list, &worker->pending);
- check_busy_worker(worker);
-
- /*
- * avoid calling into wake_up_process if this thread has already
- * been kicked
- */
- if (!worker->working)
- wake = 1;
- worker->working = 1;
-
- if (wake)
- wake_up_process(worker->task);
- spin_unlock_irqrestore(&worker->lock, flags);
+ set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1f26792683ed..9c6b66d15fb0 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -19,103 +20,35 @@
#ifndef __BTRFS_ASYNC_THREAD_
#define __BTRFS_ASYNC_THREAD_
-struct btrfs_worker_thread;
+struct btrfs_workqueue;
+/* Internal use only */
+struct __btrfs_workqueue;
+struct btrfs_work;
+typedef void (*btrfs_func_t)(struct btrfs_work *arg);
-/*
- * This is similar to a workqueue, but it is meant to spread the operations
- * across all available cpus instead of just the CPU that was used to
- * queue the work. There is also some batching introduced to try and
- * cut down on context switches.
- *
- * By default threads are added on demand up to 2 * the number of cpus.
- * Changing struct btrfs_workers->max_workers is one way to prevent
- * demand creation of kthreads.
- *
- * the basic model of these worker threads is to embed a btrfs_work
- * structure in your own data struct, and use container_of in a
- * work function to get back to your data struct.
- */
struct btrfs_work {
- /*
- * func should be set to the function you want called
- * your work struct is passed as the only arg
- *
- * ordered_func must be set for work sent to an ordered work queue,
- * and it is called to complete a given work item in the same
- * order they were sent to the queue.
- */
- void (*func)(struct btrfs_work *work);
- void (*ordered_func)(struct btrfs_work *work);
- void (*ordered_free)(struct btrfs_work *work);
-
- /*
- * flags should be set to zero. It is used to make sure the
- * struct is only inserted once into the list.
- */
+ btrfs_func_t func;
+ btrfs_func_t ordered_func;
+ btrfs_func_t ordered_free;
+
+ /* Don't touch things below */
+ struct work_struct normal_work;
+ struct list_head ordered_list;
+ struct __btrfs_workqueue *wq;
unsigned long flags;
-
- /* don't touch these */
- struct btrfs_worker_thread *worker;
- struct list_head list;
- struct list_head order_list;
-};
-
-struct btrfs_workers {
- /* current number of running workers */
- int num_workers;
-
- int num_workers_starting;
-
- /* max number of workers allowed. changed by btrfs_start_workers */
- int max_workers;
-
- /* once a worker has this many requests or fewer, it is idle */
- int idle_thresh;
-
- /* force completions in the order they were queued */
- int ordered;
-
- /* more workers required, but in an interrupt handler */
- int atomic_start_pending;
-
- /*
- * are we allowed to sleep while starting workers or are we required
- * to start them at a later time? If we can't sleep, this indicates
- * which queue we need to use to schedule thread creation.
- */
- struct btrfs_workers *atomic_worker_start;
-
- /* list with all the work threads. The workers on the idle thread
- * may be actively servicing jobs, but they haven't yet hit the
- * idle thresh limit above.
- */
- struct list_head worker_list;
- struct list_head idle_list;
-
- /*
- * when operating in ordered mode, this maintains the list
- * of work items waiting for completion
- */
- struct list_head order_list;
- struct list_head prio_order_list;
-
- /* lock for finding the next worker thread to queue on */
- spinlock_t lock;
-
- /* lock for the ordered lists */
- spinlock_t order_lock;
-
- /* extra name for this worker, used for current->name */
- char *name;
-
- int stopping;
};
-void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers);
-void btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
- struct btrfs_workers *async_starter);
-void btrfs_requeue_work(struct btrfs_work *work);
-void btrfs_set_work_high_prio(struct btrfs_work *work);
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+ int flags,
+ int max_active,
+ int thresh);
+void btrfs_init_work(struct btrfs_work *work,
+ btrfs_func_t func,
+ btrfs_func_t ordered_func,
+ btrfs_func_t ordered_free);
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *work);
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
+void btrfs_set_work_high_priority(struct btrfs_work *work);
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index aded3ef3d3d4..aad7201ad11b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -220,7 +220,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
struct ulist *parents, struct __prelim_ref *ref,
- int level, u64 time_seq, const u64 *extent_item_pos)
+ int level, u64 time_seq, const u64 *extent_item_pos,
+ u64 total_refs)
{
int ret = 0;
int slot;
@@ -249,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
ret = btrfs_next_old_leaf(root, path, time_seq);
- while (!ret && count < ref->count) {
+ while (!ret && count < total_refs) {
eb = path->nodes[0];
slot = path->slots[0];
@@ -306,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct __prelim_ref *ref,
struct ulist *parents,
- const u64 *extent_item_pos)
+ const u64 *extent_item_pos, u64 total_refs)
{
struct btrfs_root *root;
struct btrfs_key root_key;
@@ -361,7 +362,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
}
ret = add_all_parents(root, path, parents, ref, level, time_seq,
- extent_item_pos);
+ extent_item_pos, total_refs);
out:
path->lowest_level = 0;
btrfs_release_path(path);
@@ -374,7 +375,7 @@ out:
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct list_head *head,
- const u64 *extent_item_pos)
+ const u64 *extent_item_pos, u64 total_refs)
{
int err;
int ret = 0;
@@ -400,7 +401,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
if (ref->count == 0)
continue;
err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
- parents, extent_item_pos);
+ parents, extent_item_pos,
+ total_refs);
/*
* we can only tolerate ENOENT,otherwise,we should catch error
* and return directly.
@@ -557,7 +559,7 @@ static void __merge_refs(struct list_head *head, int mode)
* smaller or equal that seq to the list
*/
static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
- struct list_head *prefs)
+ struct list_head *prefs, u64 *total_refs)
{
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct rb_node *n = &head->node.rb_node;
@@ -593,6 +595,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
default:
BUG_ON(1);
}
+ *total_refs += (node->ref_mod * sgn);
switch (node->type) {
case BTRFS_TREE_BLOCK_REF_KEY: {
struct btrfs_delayed_tree_ref *ref;
@@ -653,7 +656,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
*/
static int __add_inline_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
- int *info_level, struct list_head *prefs)
+ int *info_level, struct list_head *prefs,
+ u64 *total_refs)
{
int ret = 0;
int slot;
@@ -677,6 +681,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
+ *total_refs += btrfs_extent_refs(leaf, ei);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
ptr = (unsigned long)(ei + 1);
@@ -859,6 +864,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct list_head prefs;
struct __prelim_ref *ref;
struct extent_inode_elem *eie = NULL;
+ u64 total_refs = 0;
INIT_LIST_HEAD(&prefs);
INIT_LIST_HEAD(&prefs_delayed);
@@ -873,8 +879,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- if (!trans)
+ if (!trans) {
path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
/*
* grab both a lock on the path and a lock on the delayed ref head.
@@ -915,7 +923,7 @@ again:
}
spin_unlock(&delayed_refs->lock);
ret = __add_delayed_refs(head, time_seq,
- &prefs_delayed);
+ &prefs_delayed, &total_refs);
mutex_unlock(&head->mutex);
if (ret)
goto out;
@@ -936,7 +944,8 @@ again:
(key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = __add_inline_refs(fs_info, path, bytenr,
- &info_level, &prefs);
+ &info_level, &prefs,
+ &total_refs);
if (ret)
goto out;
ret = __add_keyed_refs(fs_info, path, bytenr,
@@ -956,7 +965,7 @@ again:
__merge_refs(&prefs, 1);
ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
- extent_item_pos);
+ extent_item_pos, total_refs);
if (ret)
goto out;
@@ -965,7 +974,7 @@ again:
while (!list_empty(&prefs)) {
ref = list_first_entry(&prefs, struct __prelim_ref, list);
WARN_ON(ref->count < 0);
- if (ref->count && ref->root_id && ref->parent == 0) {
+ if (roots && ref->count && ref->root_id && ref->parent == 0) {
/* no parent == root of tree */
ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
if (ret < 0)
@@ -1061,22 +1070,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
u64 time_seq, struct ulist **leafs,
const u64 *extent_item_pos)
{
- struct ulist *tmp;
int ret;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- return -ENOMEM;
*leafs = ulist_alloc(GFP_NOFS);
- if (!*leafs) {
- ulist_free(tmp);
+ if (!*leafs)
return -ENOMEM;
- }
ret = find_parent_nodes(trans, fs_info, bytenr,
- time_seq, *leafs, tmp, extent_item_pos);
- ulist_free(tmp);
-
+ time_seq, *leafs, NULL, extent_item_pos);
if (ret < 0 && ret != -ENOENT) {
free_leaf_list(*leafs);
return ret;
@@ -1333,38 +1334,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
if (ret < 0)
return ret;
- while (1) {
- u32 nritems;
- if (path->slots[0] == 0) {
- btrfs_set_path_blocking(path);
- ret = btrfs_prev_leaf(fs_info->extent_root, path);
- if (ret != 0) {
- if (ret > 0) {
- pr_debug("logical %llu is not within "
- "any extent\n", logical);
- ret = -ENOENT;
- }
- return ret;
- }
- } else {
- path->slots[0]--;
- }
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (nritems == 0) {
- pr_debug("logical %llu is not within any extent\n",
- logical);
- return -ENOENT;
- }
- if (path->slots[0] == nritems)
- path->slots[0]--;
-
- btrfs_item_key_to_cpu(path->nodes[0], found_key,
- path->slots[0]);
- if (found_key->type == BTRFS_EXTENT_ITEM_KEY ||
- found_key->type == BTRFS_METADATA_ITEM_KEY)
- break;
+ ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ return ret;
}
-
+ btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
if (found_key->type == BTRFS_METADATA_ITEM_KEY)
size = fs_info->extent_root->leafsize;
else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 8fed2125689e..c9a24444ec9a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -109,14 +109,17 @@ struct btrfs_inode {
u64 last_trans;
/*
- * log transid when this inode was last modified
+ * transid that last logged this inode
*/
- u64 last_sub_trans;
+ u64 logged_trans;
/*
- * transid that last logged this inode
+ * log transid when this inode was last modified
*/
- u64 logged_trans;
+ int last_sub_trans;
+
+ /* a local copy of root's last_log_commit */
+ int last_log_commit;
/* total number of bytes pending delalloc, used by stat to calc the
* real block usage of the file
@@ -155,9 +158,6 @@ struct btrfs_inode {
/* flags field from the on disk inode */
u32 flags;
- /* a local copy of root's last_log_commit */
- unsigned long last_log_commit;
-
/*
* Counters to keep track of the number of extent item's we may use due
* to delalloc and such. outstanding_extents is the number of extent
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b01fb6c527e3..d43c544d3b68 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -472,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, pg_index);
rcu_read_unlock();
- if (page) {
+ if (page && !radix_tree_exceptional_entry(page)) {
misses++;
if (misses > 4)
break;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cbd3a7d6fa68..88d1b1eedc9c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5376,6 +5376,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
int advance_right;
u64 left_blockptr;
u64 right_blockptr;
+ u64 left_gen;
+ u64 right_gen;
u64 left_start_ctransid;
u64 right_start_ctransid;
u64 ctransid;
@@ -5640,7 +5642,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_blockptr = btrfs_node_blockptr(
right_path->nodes[right_level],
right_path->slots[right_level]);
- if (left_blockptr == right_blockptr) {
+ left_gen = btrfs_node_ptr_generation(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_gen = btrfs_node_ptr_generation(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ if (left_blockptr == right_blockptr &&
+ left_gen == right_gen) {
/*
* As we're on a shared block, don't
* allow to go deeper.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c1a42ca519f..bc96c03dd259 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
#define BTRFS_FS_STATE_ERROR 0
#define BTRFS_FS_STATE_REMOUNTING 1
#define BTRFS_FS_STATE_TRANS_ABORTED 2
+#define BTRFS_FS_STATE_DEV_REPLACING 3
/* Super block flags */
/* Errors detected */
@@ -1489,6 +1490,7 @@ struct btrfs_fs_info {
*/
struct list_head ordered_roots;
+ struct mutex delalloc_root_mutex;
spinlock_t delalloc_root_lock;
/* all fs/file tree roots that have delalloc inodes. */
struct list_head delalloc_roots;
@@ -1503,28 +1505,27 @@ struct btrfs_fs_info {
* A third pool does submit_bio to avoid deadlocking with the other
* two
*/
- struct btrfs_workers generic_worker;
- struct btrfs_workers workers;
- struct btrfs_workers delalloc_workers;
- struct btrfs_workers flush_workers;
- struct btrfs_workers endio_workers;
- struct btrfs_workers endio_meta_workers;
- struct btrfs_workers endio_raid56_workers;
- struct btrfs_workers rmw_workers;
- struct btrfs_workers endio_meta_write_workers;
- struct btrfs_workers endio_write_workers;
- struct btrfs_workers endio_freespace_worker;
- struct btrfs_workers submit_workers;
- struct btrfs_workers caching_workers;
- struct btrfs_workers readahead_workers;
+ struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *delalloc_workers;
+ struct btrfs_workqueue *flush_workers;
+ struct btrfs_workqueue *endio_workers;
+ struct btrfs_workqueue *endio_meta_workers;
+ struct btrfs_workqueue *endio_raid56_workers;
+ struct btrfs_workqueue *rmw_workers;
+ struct btrfs_workqueue *endio_meta_write_workers;
+ struct btrfs_workqueue *endio_write_workers;
+ struct btrfs_workqueue *endio_freespace_worker;
+ struct btrfs_workqueue *submit_workers;
+ struct btrfs_workqueue *caching_workers;
+ struct btrfs_workqueue *readahead_workers;
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
* for the sys_munmap function call path
*/
- struct btrfs_workers fixup_workers;
- struct btrfs_workers delayed_workers;
+ struct btrfs_workqueue *fixup_workers;
+ struct btrfs_workqueue *delayed_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
int thread_pool_size;
@@ -1604,9 +1605,9 @@ struct btrfs_fs_info {
atomic_t scrub_cancel_req;
wait_queue_head_t scrub_pause_wait;
int scrub_workers_refcnt;
- struct btrfs_workers scrub_workers;
- struct btrfs_workers scrub_wr_completion_workers;
- struct btrfs_workers scrub_nocow_workers;
+ struct btrfs_workqueue *scrub_workers;
+ struct btrfs_workqueue *scrub_wr_completion_workers;
+ struct btrfs_workqueue *scrub_nocow_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
@@ -1647,7 +1648,7 @@ struct btrfs_fs_info {
/* qgroup rescan items */
struct mutex qgroup_rescan_lock; /* protects the progress item */
struct btrfs_key qgroup_rescan_progress;
- struct btrfs_workers qgroup_rescan_workers;
+ struct btrfs_workqueue *qgroup_rescan_workers;
struct completion qgroup_rescan_completion;
struct btrfs_work qgroup_rescan_work;
@@ -1674,10 +1675,18 @@ struct btrfs_fs_info {
atomic_t mutually_exclusive_operation_running;
+ struct percpu_counter bio_counter;
+ wait_queue_head_t replace_wait;
+
struct semaphore uuid_tree_rescan_sem;
unsigned int update_uuid_tree_gen:1;
};
+struct btrfs_subvolume_writers {
+ struct percpu_counter counter;
+ wait_queue_head_t wait;
+};
+
/*
* in ram representation of the tree. extent_root is used for all allocations
* and for the extent tree extent_root root.
@@ -1714,11 +1723,15 @@ struct btrfs_root {
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
+ struct list_head log_ctxs[2];
atomic_t log_writers;
atomic_t log_commit[2];
atomic_t log_batch;
- unsigned long log_transid;
- unsigned long last_log_commit;
+ int log_transid;
+ /* No matter the commit succeeds or not*/
+ int log_transid_committed;
+ /* Just be updated when the commit succeeds. */
+ int last_log_commit;
pid_t log_start_pid;
bool log_multiple_pids;
@@ -1793,6 +1806,7 @@ struct btrfs_root {
spinlock_t root_item_lock;
atomic_t refs;
+ struct mutex delalloc_mutex;
spinlock_t delalloc_lock;
/*
* all of the inodes that have delalloc bytes. It is possible for
@@ -1802,6 +1816,8 @@ struct btrfs_root {
struct list_head delalloc_inodes;
struct list_head delalloc_root;
u64 nr_delalloc_inodes;
+
+ struct mutex ordered_extent_mutex;
/*
* this is used by the balancing code to wait for all the pending
* ordered extents
@@ -1822,6 +1838,8 @@ struct btrfs_root {
* manipulation with the read-only status via SUBVOL_SETFLAGS
*/
int send_in_progress;
+ struct btrfs_subvolume_writers *subv_writers;
+ atomic_t will_be_snapshoted;
};
struct btrfs_ioctl_defrag_range_args {
@@ -3346,6 +3364,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int __get_raid_index(u64 flags);
+
+int btrfs_start_nocow_write(struct btrfs_root *root);
+void btrfs_end_nocow_write(struct btrfs_root *root);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -3723,7 +3744,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+ int nr);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -4005,6 +4027,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
+/* dev-replace.c */
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
+
/* reada.c */
struct reada_control {
struct btrfs_root *root; /* tree to prefetch */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 451b00c86f6c..33e561a84013 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1392,11 +1392,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return -ENOMEM;
async_work->delayed_root = delayed_root;
- async_work->work.func = btrfs_async_run_delayed_root;
- async_work->work.flags = 0;
+ btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
+ NULL, NULL);
async_work->nr = nr;
- btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work);
+ btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
return 0;
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index f3bff89eecf0..31299646024d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -199,44 +199,31 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
*/
static struct btrfs_delayed_ref_head *
find_ref_head(struct rb_root *root, u64 bytenr,
- struct btrfs_delayed_ref_head **last, int return_bigger)
+ int return_bigger)
{
struct rb_node *n;
struct btrfs_delayed_ref_head *entry;
- int cmp = 0;
-again:
n = root->rb_node;
entry = NULL;
while (n) {
entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
- if (last)
- *last = entry;
if (bytenr < entry->node.bytenr)
- cmp = -1;
- else if (bytenr > entry->node.bytenr)
- cmp = 1;
- else
- cmp = 0;
-
- if (cmp < 0)
n = n->rb_left;
- else if (cmp > 0)
+ else if (bytenr > entry->node.bytenr)
n = n->rb_right;
else
return entry;
}
if (entry && return_bigger) {
- if (cmp > 0) {
+ if (bytenr > entry->node.bytenr) {
n = rb_next(&entry->href_node);
if (!n)
n = rb_first(root);
entry = rb_entry(n, struct btrfs_delayed_ref_head,
href_node);
- bytenr = entry->node.bytenr;
- return_bigger = 0;
- goto again;
+ return entry;
}
return entry;
}
@@ -415,12 +402,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans)
again:
start = delayed_refs->run_delayed_start;
- head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
+ head = find_ref_head(&delayed_refs->href_root, start, 1);
if (!head && !loop) {
delayed_refs->run_delayed_start = 0;
start = 0;
loop = true;
- head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
+ head = find_ref_head(&delayed_refs->href_root, start, 1);
if (!head)
return NULL;
} else if (!head && loop) {
@@ -508,6 +495,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
ref = btrfs_delayed_node_to_head(update);
BUG_ON(existing_ref->is_data != ref->is_data);
+ spin_lock(&existing_ref->lock);
if (ref->must_insert_reserved) {
/* if the extent was freed and then
* reallocated before the delayed ref
@@ -549,7 +537,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
* only need the lock for this case cause we could be processing it
* currently, for refs we just added we know we're a-ok.
*/
- spin_lock(&existing_ref->lock);
existing->ref_mod += update->ref_mod;
spin_unlock(&existing_ref->lock);
}
@@ -898,7 +885,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
struct btrfs_delayed_ref_root *delayed_refs;
delayed_refs = &trans->transaction->delayed_refs;
- return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0);
+ return find_ref_head(&delayed_refs->href_root, bytenr, 0);
}
void btrfs_delayed_ref_exit(void)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 564c92638b20..9f2290509aca 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -431,6 +431,35 @@ leave_no_lock:
return ret;
}
+/*
+ * blocked until all flighting bios are finished.
+ */
+static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
+{
+ s64 writers;
+ DEFINE_WAIT(wait);
+
+ set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+ do {
+ prepare_to_wait(&fs_info->replace_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ writers = percpu_counter_sum(&fs_info->bio_counter);
+ if (writers)
+ schedule();
+ finish_wait(&fs_info->replace_wait, &wait);
+ } while (writers);
+}
+
+/*
+ * we have removed target device, it is safe to allow new bios request.
+ */
+static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
+{
+ clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+ if (waitqueue_active(&fs_info->replace_wait))
+ wake_up(&fs_info->replace_wait);
+}
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
@@ -458,17 +487,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device = dev_replace->srcdev;
btrfs_dev_replace_unlock(dev_replace);
- /* replace old device with new one in mapping tree */
- if (!scrub_ret)
- btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
- src_device,
- tgt_device);
-
/*
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(root->fs_info, 0);
+ ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
@@ -484,6 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
WARN_ON(ret);
/* keep away write_all_supers() during the finishing procedure */
+ mutex_lock(&root->fs_info->chunk_mutex);
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
btrfs_dev_replace_lock(dev_replace);
dev_replace->replace_state =
@@ -494,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
dev_replace->time_stopped = get_seconds();
dev_replace->item_needs_writeback = 1;
- if (scrub_ret) {
+ /* replace old device with new one in mapping tree */
+ if (!scrub_ret) {
+ btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+ src_device,
+ tgt_device);
+ } else {
printk_in_rcu(KERN_ERR
"BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
src_device->missing ? "<missing disk>" :
@@ -503,6 +532,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
rcu_str_deref(tgt_device->name), scrub_ret);
btrfs_dev_replace_unlock(dev_replace);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&root->fs_info->chunk_mutex);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -532,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
fs_info->fs_devices->latest_bdev = tgt_device->bdev;
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+ btrfs_rm_dev_replace_blocked(fs_info);
+
btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+ btrfs_rm_dev_replace_unblocked(fs_info);
+
/*
* this is again a consistent state where no dev_replace procedure
* is running, the target device is part of the filesystem, the
@@ -543,6 +577,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
*/
btrfs_dev_replace_unlock(dev_replace);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&root->fs_info->chunk_mutex);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -862,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
mutex_unlock(&dev_replace->lock_management_lock);
}
}
+
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_inc(&fs_info->bio_counter);
+}
+
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_dec(&fs_info->bio_counter);
+
+ if (waitqueue_active(&fs_info->replace_wait))
+ wake_up(&fs_info->replace_wait);
+}
+
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
+{
+ DEFINE_WAIT(wait);
+again:
+ percpu_counter_inc(&fs_info->bio_counter);
+ if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
+ btrfs_bio_counter_dec(fs_info);
+ wait_event(fs_info->replace_wait,
+ !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+ &fs_info->fs_state));
+ goto again;
+ }
+
+}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5215f04260b2..bd0f752b797b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -678,32 +678,31 @@ static void end_workqueue_bio(struct bio *bio, int err)
fs_info = end_io_wq->info;
end_io_wq->error = err;
- end_io_wq->work.func = end_workqueue_fn;
- end_io_wq->work.flags = 0;
+ btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
if (bio->bi_rw & REQ_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
- btrfs_queue_worker(&fs_info->endio_meta_write_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_meta_write_workers,
+ &end_io_wq->work);
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
- btrfs_queue_worker(&fs_info->endio_freespace_worker,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_freespace_worker,
+ &end_io_wq->work);
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- btrfs_queue_worker(&fs_info->endio_raid56_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_raid56_workers,
+ &end_io_wq->work);
else
- btrfs_queue_worker(&fs_info->endio_write_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_write_workers,
+ &end_io_wq->work);
} else {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- btrfs_queue_worker(&fs_info->endio_raid56_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_raid56_workers,
+ &end_io_wq->work);
else if (end_io_wq->metadata)
- btrfs_queue_worker(&fs_info->endio_meta_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_meta_workers,
+ &end_io_wq->work);
else
- btrfs_queue_worker(&fs_info->endio_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_workers,
+ &end_io_wq->work);
}
}
@@ -738,7 +737,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
{
unsigned long limit = min_t(unsigned long,
- info->workers.max_workers,
+ info->thread_pool_size,
info->fs_devices->open_devices);
return 256 * limit;
}
@@ -811,11 +810,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->submit_bio_start = submit_bio_start;
async->submit_bio_done = submit_bio_done;
- async->work.func = run_one_async_start;
- async->work.ordered_func = run_one_async_done;
- async->work.ordered_free = run_one_async_free;
+ btrfs_init_work(&async->work, run_one_async_start,
+ run_one_async_done, run_one_async_free);
- async->work.flags = 0;
async->bio_flags = bio_flags;
async->bio_offset = bio_offset;
@@ -824,9 +821,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
atomic_inc(&fs_info->nr_async_submits);
if (rw & REQ_SYNC)
- btrfs_set_work_high_prio(&async->work);
+ btrfs_set_work_high_priority(&async->work);
- btrfs_queue_worker(&fs_info->workers, &async->work);
+ btrfs_queue_work(fs_info->workers, &async->work);
while (atomic_read(&fs_info->async_submit_draining) &&
atomic_read(&fs_info->nr_async_submits)) {
@@ -1149,6 +1146,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
}
+static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
+{
+ struct btrfs_subvolume_writers *writers;
+ int ret;
+
+ writers = kmalloc(sizeof(*writers), GFP_NOFS);
+ if (!writers)
+ return ERR_PTR(-ENOMEM);
+
+ ret = percpu_counter_init(&writers->counter, 0);
+ if (ret < 0) {
+ kfree(writers);
+ return ERR_PTR(ret);
+ }
+
+ init_waitqueue_head(&writers->wait);
+ return writers;
+}
+
+static void
+btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
+{
+ percpu_counter_destroy(&writers->counter);
+ kfree(writers);
+}
+
static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
u32 stripesize, struct btrfs_root *root,
struct btrfs_fs_info *fs_info,
@@ -1194,16 +1217,22 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
spin_lock_init(&root->log_extents_lock[1]);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
+ mutex_init(&root->ordered_extent_mutex);
+ mutex_init(&root->delalloc_mutex);
init_waitqueue_head(&root->log_writer_wait);
init_waitqueue_head(&root->log_commit_wait[0]);
init_waitqueue_head(&root->log_commit_wait[1]);
+ INIT_LIST_HEAD(&root->log_ctxs[0]);
+ INIT_LIST_HEAD(&root->log_ctxs[1]);
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
atomic_set(&root->orphan_inodes, 0);
atomic_set(&root->refs, 1);
+ atomic_set(&root->will_be_snapshoted, 0);
root->log_transid = 0;
+ root->log_transid_committed = -1;
root->last_log_commit = 0;
if (fs_info)
extent_io_tree_init(&root->dirty_log_pages,
@@ -1417,6 +1446,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root->log_root);
root->log_root = log_root;
root->log_transid = 0;
+ root->log_transid_committed = -1;
root->last_log_commit = 0;
return 0;
}
@@ -1498,6 +1528,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
int btrfs_init_fs_root(struct btrfs_root *root)
{
int ret;
+ struct btrfs_subvolume_writers *writers;
root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1507,6 +1538,13 @@ int btrfs_init_fs_root(struct btrfs_root *root)
goto fail;
}
+ writers = btrfs_alloc_subvolume_writers();
+ if (IS_ERR(writers)) {
+ ret = PTR_ERR(writers);
+ goto fail;
+ }
+ root->subv_writers = writers;
+
btrfs_init_free_ino_ctl(root);
mutex_init(&root->fs_commit_mutex);
spin_lock_init(&root->cache_lock);
@@ -1514,8 +1552,11 @@ int btrfs_init_fs_root(struct btrfs_root *root)
ret = get_anon_bdev(&root->anon_dev);
if (ret)
- goto fail;
+ goto free_writers;
return 0;
+
+free_writers:
+ btrfs_free_subvolume_writers(root->subv_writers);
fail:
kfree(root->free_ino_ctl);
kfree(root->free_ino_pinned);
@@ -1990,23 +2031,22 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
/* helper to cleanup workers */
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
- btrfs_stop_workers(&fs_info->generic_worker);
- btrfs_stop_workers(&fs_info->fixup_workers);
- btrfs_stop_workers(&fs_info->delalloc_workers);
- btrfs_stop_workers(&fs_info->workers);
- btrfs_stop_workers(&fs_info->endio_workers);
- btrfs_stop_workers(&fs_info->endio_meta_workers);
- btrfs_stop_workers(&fs_info->endio_raid56_workers);
- btrfs_stop_workers(&fs_info->rmw_workers);
- btrfs_stop_workers(&fs_info->endio_meta_write_workers);
- btrfs_stop_workers(&fs_info->endio_write_workers);
- btrfs_stop_workers(&fs_info->endio_freespace_worker);
- btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->delayed_workers);
- btrfs_stop_workers(&fs_info->caching_workers);
- btrfs_stop_workers(&fs_info->readahead_workers);
- btrfs_stop_workers(&fs_info->flush_workers);
- btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
+ btrfs_destroy_workqueue(fs_info->fixup_workers);
+ btrfs_destroy_workqueue(fs_info->delalloc_workers);
+ btrfs_destroy_workqueue(fs_info->workers);
+ btrfs_destroy_workqueue(fs_info->endio_workers);
+ btrfs_destroy_workqueue(fs_info->endio_meta_workers);
+ btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ btrfs_destroy_workqueue(fs_info->rmw_workers);
+ btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
+ btrfs_destroy_workqueue(fs_info->submit_workers);
+ btrfs_destroy_workqueue(fs_info->delayed_workers);
+ btrfs_destroy_workqueue(fs_info->caching_workers);
+ btrfs_destroy_workqueue(fs_info->readahead_workers);
+ btrfs_destroy_workqueue(fs_info->flush_workers);
+ btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
}
static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2097,6 +2137,8 @@ int open_ctree(struct super_block *sb,
int err = -EINVAL;
int num_backups_tried = 0;
int backup_index = 0;
+ int max_active;
+ int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
bool create_uuid_tree;
bool check_uuid_tree;
@@ -2133,10 +2175,16 @@ int open_ctree(struct super_block *sb,
goto fail_dirty_metadata_bytes;
}
+ ret = percpu_counter_init(&fs_info->bio_counter, 0);
+ if (ret) {
+ err = ret;
+ goto fail_delalloc_bytes;
+ }
+
fs_info->btree_inode = new_inode(sb);
if (!fs_info->btree_inode) {
err = -ENOMEM;
- goto fail_delalloc_bytes;
+ goto fail_bio_counter;
}
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2159,6 +2207,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->buffer_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
+ mutex_init(&fs_info->delalloc_root_mutex);
seqlock_init(&fs_info->profiles_lock);
init_completion(&fs_info->kobj_unregister);
@@ -2211,6 +2260,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->scrub_pause_req, 0);
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
+ init_waitqueue_head(&fs_info->replace_wait);
init_waitqueue_head(&fs_info->scrub_pause_wait);
fs_info->scrub_workers_refcnt = 0;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -2458,104 +2508,68 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- btrfs_init_workers(&fs_info->generic_worker,
- "genwork", 1, NULL);
-
- btrfs_init_workers(&fs_info->workers, "worker",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
+ max_active = fs_info->thread_pool_size;
- btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
- fs_info->thread_pool_size, NULL);
+ fs_info->workers =
+ btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
+ max_active, 16);
- btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
- fs_info->thread_pool_size, NULL);
+ fs_info->delalloc_workers =
+ btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
- btrfs_init_workers(&fs_info->submit_workers, "submit",
- min_t(u64, fs_devices->num_devices,
- fs_info->thread_pool_size), NULL);
+ fs_info->flush_workers =
+ btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
- btrfs_init_workers(&fs_info->caching_workers, "cache",
- fs_info->thread_pool_size, NULL);
+ fs_info->caching_workers =
+ btrfs_alloc_workqueue("cache", flags, max_active, 0);
- /* a higher idle thresh on the submit workers makes it much more
+ /*
+ * a higher idle thresh on the submit workers makes it much more
* likely that bios will be send down in a sane order to the
* devices
*/
- fs_info->submit_workers.idle_thresh = 64;
-
- fs_info->workers.idle_thresh = 16;
- fs_info->workers.ordered = 1;
-
- fs_info->delalloc_workers.idle_thresh = 2;
- fs_info->delalloc_workers.ordered = 1;
-
- btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_workers, "endio",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_meta_write_workers,
- "endio-meta-write", fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_raid56_workers,
- "endio-raid56", fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->rmw_workers,
- "rmw", fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
- 1, &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->readahead_workers, "readahead",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
- &fs_info->generic_worker);
+ fs_info->submit_workers =
+ btrfs_alloc_workqueue("submit", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 64);
+
+ fs_info->fixup_workers =
+ btrfs_alloc_workqueue("fixup", flags, 1, 0);
/*
* endios are largely parallel and should have a very
* low idle thresh
*/
- fs_info->endio_workers.idle_thresh = 4;
- fs_info->endio_meta_workers.idle_thresh = 4;
- fs_info->endio_raid56_workers.idle_thresh = 4;
- fs_info->rmw_workers.idle_thresh = 2;
-
- fs_info->endio_write_workers.idle_thresh = 2;
- fs_info->endio_meta_write_workers.idle_thresh = 2;
- fs_info->readahead_workers.idle_thresh = 2;
-
- /*
- * btrfs_start_workers can really only fail because of ENOMEM so just
- * return -ENOMEM if any of these fail.
- */
- ret = btrfs_start_workers(&fs_info->workers);
- ret |= btrfs_start_workers(&fs_info->generic_worker);
- ret |= btrfs_start_workers(&fs_info->submit_workers);
- ret |= btrfs_start_workers(&fs_info->delalloc_workers);
- ret |= btrfs_start_workers(&fs_info->fixup_workers);
- ret |= btrfs_start_workers(&fs_info->endio_workers);
- ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
- ret |= btrfs_start_workers(&fs_info->rmw_workers);
- ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
- ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
- ret |= btrfs_start_workers(&fs_info->endio_write_workers);
- ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
- ret |= btrfs_start_workers(&fs_info->delayed_workers);
- ret |= btrfs_start_workers(&fs_info->caching_workers);
- ret |= btrfs_start_workers(&fs_info->readahead_workers);
- ret |= btrfs_start_workers(&fs_info->flush_workers);
- ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers);
- if (ret) {
+ fs_info->endio_workers =
+ btrfs_alloc_workqueue("endio", flags, max_active, 4);
+ fs_info->endio_meta_workers =
+ btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+ fs_info->endio_meta_write_workers =
+ btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+ fs_info->endio_raid56_workers =
+ btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ fs_info->rmw_workers =
+ btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+ fs_info->endio_write_workers =
+ btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+ fs_info->endio_freespace_worker =
+ btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+ fs_info->delayed_workers =
+ btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+ fs_info->readahead_workers =
+ btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+ fs_info->qgroup_rescan_workers =
+ btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+
+ if (!(fs_info->workers && fs_info->delalloc_workers &&
+ fs_info->submit_workers && fs_info->flush_workers &&
+ fs_info->endio_workers && fs_info->endio_meta_workers &&
+ fs_info->endio_meta_write_workers &&
+ fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+ fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+ fs_info->caching_workers && fs_info->readahead_workers &&
+ fs_info->fixup_workers && fs_info->delayed_workers &&
+ fs_info->qgroup_rescan_workers)) {
err = -ENOMEM;
goto fail_sb_buffer;
}
@@ -2963,6 +2977,8 @@ fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
iput(fs_info->btree_inode);
+fail_bio_counter:
+ percpu_counter_destroy(&fs_info->bio_counter);
fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
@@ -3244,6 +3260,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
/* send down all the barriers */
head = &info->fs_devices->devices;
list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
if (!dev->bdev) {
errors_send++;
continue;
@@ -3258,6 +3276,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
/* wait for all the barriers */
list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
if (!dev->bdev) {
errors_wait++;
continue;
@@ -3477,6 +3497,8 @@ static void free_fs_root(struct btrfs_root *root)
root->orphan_block_rsv = NULL;
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
+ if (root->subv_writers)
+ btrfs_free_subvolume_writers(root->subv_writers);
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
kfree(root->free_ino_ctl);
@@ -3610,6 +3632,7 @@ int close_ctree(struct btrfs_root *root)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
+ percpu_counter_destroy(&fs_info->bio_counter);
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
@@ -3791,9 +3814,11 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
list_move_tail(&root->ordered_root,
&fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
btrfs_destroy_ordered_extents(root);
- cond_resched_lock(&fs_info->ordered_root_lock);
+ cond_resched();
+ spin_lock(&fs_info->ordered_root_lock);
}
spin_unlock(&fs_info->ordered_root_lock);
}
@@ -3839,7 +3864,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
rb_erase(&ref->rb_node, &head->ref_root);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
- cond_resched_lock(&head->lock);
}
if (head->must_insert_reserved)
pin_bytes = true;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32312e09f0f5..c6b6a6e3e735 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -549,7 +549,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
caching_ctl->block_group = cache;
caching_ctl->progress = cache->key.objectid;
atomic_set(&caching_ctl->count, 1);
- caching_ctl->work.func = caching_thread;
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
spin_lock(&cache->lock);
/*
@@ -640,7 +640,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
btrfs_get_block_group(cache);
- btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
+ btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
return ret;
}
@@ -3971,7 +3971,7 @@ static int can_overcommit(struct btrfs_root *root,
}
static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
- unsigned long nr_pages)
+ unsigned long nr_pages, int nr_items)
{
struct super_block *sb = root->fs_info->sb;
@@ -3986,9 +3986,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
* the filesystem is readonly(all dirty pages are written to
* the disk).
*/
- btrfs_start_delalloc_roots(root->fs_info, 0);
+ btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
if (!current->journal_info)
- btrfs_wait_ordered_roots(root->fs_info, -1);
+ btrfs_wait_ordered_roots(root->fs_info, nr_items);
}
}
@@ -4045,7 +4045,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
while (delalloc_bytes && loops < 3) {
max_reclaim = min(delalloc_bytes, to_reclaim);
nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
- btrfs_writeback_inodes_sb_nr(root, nr_pages);
+ btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
/*
* We need to wait for the async pages to actually start before
* we do anything.
@@ -4112,13 +4112,9 @@ static int may_commit_transaction(struct btrfs_root *root,
goto commit;
/* See if there is enough pinned space to make this reservation */
- spin_lock(&space_info->lock);
if (percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes) >= 0) {
- spin_unlock(&space_info->lock);
+ bytes) >= 0)
goto commit;
- }
- spin_unlock(&space_info->lock);
/*
* See if there is some space in the delayed insertion reservation for
@@ -4127,16 +4123,13 @@ static int may_commit_transaction(struct btrfs_root *root,
if (space_info != delayed_rsv->space_info)
return -ENOSPC;
- spin_lock(&space_info->lock);
spin_lock(&delayed_rsv->lock);
if (percpu_counter_compare(&space_info->total_bytes_pinned,
bytes - delayed_rsv->size) >= 0) {
spin_unlock(&delayed_rsv->lock);
- spin_unlock(&space_info->lock);
return -ENOSPC;
}
spin_unlock(&delayed_rsv->lock);
- spin_unlock(&space_info->lock);
commit:
trans = btrfs_join_transaction(root);
@@ -4181,7 +4174,7 @@ static int flush_space(struct btrfs_root *root,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(root, num_bytes, orig_bytes,
+ shrink_delalloc(root, num_bytes * 2, orig_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
case ALLOC_CHUNK:
@@ -8938,3 +8931,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
range->len = trimmed;
return ret;
}
+
+/*
+ * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
+ * they are used to prevent the some tasks writing data into the page cache
+ * by nocow before the subvolume is snapshoted, but flush the data into
+ * the disk after the snapshot creation.
+ */
+void btrfs_end_nocow_write(struct btrfs_root *root)
+{
+ percpu_counter_dec(&root->subv_writers->counter);
+ /*
+ * Make sure counter is updated before we wake up
+ * waiters.
+ */
+ smp_mb();
+ if (waitqueue_active(&root->subv_writers->wait))
+ wake_up(&root->subv_writers->wait);
+}
+
+int btrfs_start_nocow_write(struct btrfs_root *root)
+{
+ if (unlikely(atomic_read(&root->will_be_snapshoted)))
+ return 0;
+
+ percpu_counter_inc(&root->subv_writers->counter);
+ /*
+ * Make sure counter is updated before we check for snapshot creation.
+ */
+ smp_mb();
+ if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+ btrfs_end_nocow_write(root);
+ return 0;
+ }
+ return 1;
+}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 85bbd01f1271..ae69a00387e7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -229,12 +229,14 @@ void free_extent_state(struct extent_state *state)
}
}
-static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+static struct rb_node *tree_insert(struct rb_root *root,
+ struct rb_node *search_start,
+ u64 offset,
struct rb_node *node,
struct rb_node ***p_in,
struct rb_node **parent_in)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p;
struct rb_node *parent = NULL;
struct tree_entry *entry;
@@ -244,6 +246,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
goto do_insert;
}
+ p = search_start ? &search_start : &root->rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -430,7 +433,7 @@ static int insert_state(struct extent_io_tree *tree,
set_state_bits(tree, state, bits);
- node = tree_insert(&tree->state, end, &state->rb_node, p, parent);
+ node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
if (node) {
struct extent_state *found;
found = rb_entry(node, struct extent_state, rb_node);
@@ -477,8 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
prealloc->state = orig->state;
orig->start = split;
- node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node,
- NULL, NULL);
+ node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
+ &prealloc->rb_node, NULL, NULL);
if (node) {
free_extent_state(prealloc);
return -EEXIST;
@@ -2757,7 +2760,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
if (em_cached && *em_cached) {
em = *em_cached;
- if (em->in_tree && start >= em->start &&
+ if (extent_map_in_tree(em) && start >= em->start &&
start < extent_map_end(em)) {
atomic_inc(&em->refs);
return em;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 996ad56b57db..1874aee69c86 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void)
em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
if (!em)
return NULL;
- em->in_tree = 0;
+ RB_CLEAR_NODE(&em->rb_node);
em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
em->generation = 0;
@@ -73,7 +73,7 @@ void free_extent_map(struct extent_map *em)
return;
WARN_ON(atomic_read(&em->refs) == 0);
if (atomic_dec_and_test(&em->refs)) {
- WARN_ON(em->in_tree);
+ WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
kmem_cache_free(extent_map_cache, em);
}
@@ -99,8 +99,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
parent = *p;
entry = rb_entry(parent, struct extent_map, rb_node);
- WARN_ON(!entry->in_tree);
-
if (em->start < entry->start)
p = &(*p)->rb_left;
else if (em->start >= extent_map_end(entry))
@@ -128,7 +126,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
if (end > entry->start && em->start < extent_map_end(entry))
return -EEXIST;
- em->in_tree = 1;
rb_link_node(&em->rb_node, orig_parent, p);
rb_insert_color(&em->rb_node, root);
return 0;
@@ -153,8 +150,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
prev = n;
prev_entry = entry;
- WARN_ON(!entry->in_tree);
-
if (offset < entry->start)
n = n->rb_left;
else if (offset >= extent_map_end(entry))
@@ -240,12 +235,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->len += merge->len;
em->block_len += merge->block_len;
em->block_start = merge->block_start;
- merge->in_tree = 0;
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
rb_erase(&merge->rb_node, &tree->map);
+ RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
}
}
@@ -257,7 +252,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->len += merge->len;
em->block_len += merge->block_len;
rb_erase(&merge->rb_node, &tree->map);
- merge->in_tree = 0;
+ RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
free_extent_map(merge);
@@ -319,7 +314,21 @@ out:
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
- if (em->in_tree)
+ if (extent_map_in_tree(em))
+ try_merge_map(tree, em);
+}
+
+static inline void setup_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em,
+ int modified)
+{
+ atomic_inc(&em->refs);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ if (modified)
+ list_move(&em->list, &tree->modified_extents);
+ else
try_merge_map(tree, em);
}
@@ -342,15 +351,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
if (ret)
goto out;
- atomic_inc(&em->refs);
-
- em->mod_start = em->start;
- em->mod_len = em->len;
-
- if (modified)
- list_move(&em->list, &tree->modified_extents);
- else
- try_merge_map(tree, em);
+ setup_extent_mapping(tree, em, modified);
out:
return ret;
}
@@ -434,6 +435,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
rb_erase(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
list_del_init(&em->list);
- em->in_tree = 0;
+ RB_CLEAR_NODE(&em->rb_node);
return ret;
}
+
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified)
+{
+ WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+ ASSERT(extent_map_in_tree(cur));
+ if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+ list_del_init(&cur->list);
+ rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+ RB_CLEAR_NODE(&cur->rb_node);
+
+ setup_extent_mapping(tree, new, modified);
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 93fba716d7f8..e7fd8a56a140 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -33,7 +33,6 @@ struct extent_map {
unsigned long flags;
struct block_device *bdev;
atomic_t refs;
- unsigned int in_tree;
unsigned int compress_type;
struct list_head list;
};
@@ -44,6 +43,11 @@ struct extent_map_tree {
rwlock_t lock;
};
+static inline int extent_map_in_tree(const struct extent_map *em)
+{
+ return !RB_EMPTY_NODE(&em->rb_node);
+}
+
static inline u64 extent_map_end(struct extent_map *em)
{
if (em->start + em->len < em->start)
@@ -64,6 +68,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em, int modified);
int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified);
struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0165b8672f09..e1ffb1e22898 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -591,7 +591,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &flags);
modified = !list_empty(&em->list);
- remove_extent_mapping(em_tree, em);
if (no_splits)
goto next;
@@ -622,8 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
- ret = add_extent_mapping(em_tree, split, modified);
- BUG_ON(ret); /* Logic error */
+ replace_extent_mapping(em_tree, em, split, modified);
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -661,12 +659,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->orig_block_len = 0;
}
- ret = add_extent_mapping(em_tree, split, modified);
- BUG_ON(ret); /* Logic error */
+ if (extent_map_in_tree(em)) {
+ replace_extent_mapping(em_tree, em, split,
+ modified);
+ } else {
+ ret = add_extent_mapping(em_tree, split,
+ modified);
+ ASSERT(ret == 0); /* Logic error */
+ }
free_extent_map(split);
split = NULL;
}
next:
+ if (extent_map_in_tree(em))
+ remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* once for us */
@@ -720,7 +726,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (drop_cache)
btrfs_drop_extent_cache(inode, start, end - 1, 0);
- if (start >= BTRFS_I(inode)->disk_i_size)
+ if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
modify_tree = 0;
while (1) {
@@ -798,7 +804,10 @@ next_slot:
*/
if (start > key.offset && end < extent_end) {
BUG_ON(del_nr > 0);
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EINVAL;
+ break;
+ }
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = start;
@@ -841,7 +850,10 @@ next_slot:
* | -------- extent -------- |
*/
if (start <= key.offset && end < extent_end) {
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EINVAL;
+ break;
+ }
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = end;
@@ -864,7 +876,10 @@ next_slot:
*/
if (start > key.offset && end >= extent_end) {
BUG_ON(del_nr > 0);
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EINVAL;
+ break;
+ }
btrfs_set_file_extent_num_bytes(leaf, fi,
start - key.offset);
@@ -938,34 +953,42 @@ next_slot:
* Set path->slots[0] to first slot, so that after the delete
* if items are move off from our leaf to its immediate left or
* right neighbor leafs, we end up with a correct and adjusted
- * path->slots[0] for our insertion.
+ * path->slots[0] for our insertion (if replace_extent != 0).
*/
path->slots[0] = del_slot;
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret)
btrfs_abort_transaction(trans, root, ret);
+ }
- leaf = path->nodes[0];
- /*
- * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that
- * is, its contents got pushed to its neighbors), in which case
- * it means path->locks[0] == 0
- */
- if (!ret && replace_extent && leafs_visited == 1 &&
- path->locks[0] &&
- btrfs_leaf_free_space(root, leaf) >=
- sizeof(struct btrfs_item) + extent_item_size) {
-
- key.objectid = ino;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
- setup_items_for_insert(root, path, &key,
- &extent_item_size,
- extent_item_size,
- sizeof(struct btrfs_item) +
- extent_item_size, 1);
- *key_inserted = 1;
+ leaf = path->nodes[0];
+ /*
+ * If btrfs_del_items() was called, it might have deleted a leaf, in
+ * which case it unlocked our path, so check path->locks[0] matches a
+ * write lock.
+ */
+ if (!ret && replace_extent && leafs_visited == 1 &&
+ (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
+ path->locks[0] == BTRFS_WRITE_LOCK) &&
+ btrfs_leaf_free_space(root, leaf) >=
+ sizeof(struct btrfs_item) + extent_item_size) {
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+ if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
+ struct btrfs_key slot_key;
+
+ btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
+ if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
+ path->slots[0]++;
}
+ setup_items_for_insert(root, path, &key,
+ &extent_item_size,
+ extent_item_size,
+ sizeof(struct btrfs_item) +
+ extent_item_size, 1);
+ *key_inserted = 1;
}
if (!replace_extent || !(*key_inserted))
@@ -1346,11 +1369,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
start_pos, last_pos, 0, cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, last_pos);
+ ordered = btrfs_lookup_ordered_range(inode, start_pos,
+ last_pos - start_pos + 1);
if (ordered &&
ordered->file_offset + ordered->len > start_pos &&
ordered->file_offset <= last_pos) {
- btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
start_pos, last_pos,
cached_state, GFP_NOFS);
@@ -1358,12 +1381,9 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
- ret = btrfs_wait_ordered_range(inode, start_pos,
- last_pos - start_pos + 1);
- if (ret)
- return ret;
- else
- return -EAGAIN;
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ return -EAGAIN;
}
if (ordered)
btrfs_put_ordered_extent(ordered);
@@ -1396,8 +1416,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
u64 num_bytes;
int ret;
+ ret = btrfs_start_nocow_write(root);
+ if (!ret)
+ return -ENOSPC;
+
lockstart = round_down(pos, root->sectorsize);
- lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+ lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
while (1) {
lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1415,12 +1439,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
if (ret <= 0) {
ret = 0;
+ btrfs_end_nocow_write(root);
} else {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
- NULL, GFP_NOFS);
- *write_bytes = min_t(size_t, *write_bytes, num_bytes);
+ *write_bytes = min_t(size_t, *write_bytes ,
+ num_bytes - pos + lockstart);
}
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1510,6 +1532,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode,
reserve_bytes);
+ else
+ btrfs_end_nocow_write(root);
break;
}
@@ -1598,6 +1622,9 @@ again:
}
release_bytes = 0;
+ if (only_release_metadata)
+ btrfs_end_nocow_write(root);
+
if (only_release_metadata && copied > 0) {
u64 lockstart = round_down(pos, root->sectorsize);
u64 lockend = lockstart +
@@ -1624,10 +1651,12 @@ again:
kfree(pages);
if (release_bytes) {
- if (only_release_metadata)
+ if (only_release_metadata) {
+ btrfs_end_nocow_write(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
- else
+ } else {
btrfs_delalloc_release_space(inode, release_bytes);
+ }
}
return num_written ? num_written : ret;
@@ -1797,7 +1826,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
BTRFS_I(inode)->last_sub_trans = root->log_transid;
if (num_written > 0) {
err = generic_write_sync(file, pos, num_written);
- if (err < 0 && num_written > 0)
+ if (err < 0)
num_written = err;
}
@@ -1856,8 +1885,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
struct btrfs_trans_handle *trans;
+ struct btrfs_log_ctx ctx;
+ int ret = 0;
bool full_sync = 0;
trace_btrfs_sync_file(file, datasync);
@@ -1951,7 +1981,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
trans->sync = true;
- ret = btrfs_log_dentry_safe(trans, root, dentry);
+ btrfs_init_log_ctx(&ctx);
+
+ ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -1971,7 +2003,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
- ret = btrfs_sync_log(trans, root);
+ ret = btrfs_sync_log(trans, root, &ctx);
if (!ret) {
ret = btrfs_end_transaction(trans, root);
goto out;
@@ -2157,6 +2189,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
((offset + len - 1) >> PAGE_CACHE_SHIFT));
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
+ u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
@@ -2172,14 +2205,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
* entire page.
*/
if (same_page && len < PAGE_CACHE_SIZE) {
- if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+ if (offset < ino_size)
ret = btrfs_truncate_page(inode, offset, len, 0);
mutex_unlock(&inode->i_mutex);
return ret;
}
/* zero back part of the first page */
- if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ if (offset < ino_size) {
ret = btrfs_truncate_page(inode, offset, 0, 0);
if (ret) {
mutex_unlock(&inode->i_mutex);
@@ -2188,7 +2221,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
/* zero the front end of the last page */
- if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ if (offset + len < ino_size) {
ret = btrfs_truncate_page(inode, offset + len, 0, 1);
if (ret) {
mutex_unlock(&inode->i_mutex);
@@ -2277,10 +2310,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
trans->block_rsv = &root->fs_info->trans_block_rsv;
- ret = fill_holes(trans, inode, path, cur_offset, drop_end);
- if (ret) {
- err = ret;
- break;
+ if (cur_offset < ino_size) {
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_end);
+ if (ret) {
+ err = ret;
+ break;
+ }
}
cur_offset = drop_end;
@@ -2313,10 +2349,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
trans->block_rsv = &root->fs_info->trans_block_rsv;
- ret = fill_holes(trans, inode, path, cur_offset, drop_end);
- if (ret) {
- err = ret;
- goto out_trans;
+ if (cur_offset < ino_size) {
+ ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
}
out_trans:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 184e9cb39647..06e9a4152b14 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -864,7 +864,8 @@ static noinline int cow_file_range(struct inode *inode,
if (btrfs_is_free_space_inode(inode)) {
WARN_ON_ONCE(1);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock;
}
num_bytes = ALIGN(end - start + 1, blocksize);
@@ -1075,17 +1076,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->end = cur_end;
INIT_LIST_HEAD(&async_cow->extents);
- async_cow->work.func = async_cow_start;
- async_cow->work.ordered_func = async_cow_submit;
- async_cow->work.ordered_free = async_cow_free;
- async_cow->work.flags = 0;
+ btrfs_init_work(&async_cow->work, async_cow_start,
+ async_cow_submit, async_cow_free);
nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
- btrfs_queue_worker(&root->fs_info->delalloc_workers,
- &async_cow->work);
+ btrfs_queue_work(root->fs_info->delalloc_workers,
+ &async_cow->work);
if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
wait_event(root->fs_info->async_submit_wait,
@@ -1843,9 +1842,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
SetPageChecked(page);
page_cache_get(page);
- fixup->work.func = btrfs_writepage_fixup_worker;
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
- btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
+ btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
return -EBUSY;
}
@@ -2239,6 +2238,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
return PTR_ERR(root);
}
+ if (btrfs_root_readonly(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return 0;
+ }
+
/* step 2: get inode */
key.objectid = backref->inum;
key.type = BTRFS_INODE_ITEM_KEY;
@@ -2759,7 +2763,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_extent *ordered_extent = NULL;
- struct btrfs_workers *workers;
+ struct btrfs_workqueue *workers;
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
@@ -2768,14 +2772,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
end - start + 1, uptodate))
return 0;
- ordered_extent->work.func = finish_ordered_fn;
- ordered_extent->work.flags = 0;
+ btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
if (btrfs_is_free_space_inode(inode))
- workers = &root->fs_info->endio_freespace_worker;
+ workers = root->fs_info->endio_freespace_worker;
else
- workers = &root->fs_info->endio_write_workers;
- btrfs_queue_worker(workers, &ordered_extent->work);
+ workers = root->fs_info->endio_write_workers;
+ btrfs_queue_work(workers, &ordered_extent->work);
return 0;
}
@@ -4593,7 +4596,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
struct rb_node *node;
ASSERT(inode->i_state & I_FREEING);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
write_lock(&map_tree->lock);
while (!RB_EMPTY_ROOT(&map_tree->map)) {
@@ -4924,7 +4927,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root)
struct inode *inode;
u64 objectid = 0;
- WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+ if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
again:
@@ -5154,7 +5158,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
return ERR_CAST(inode);
}
- return d_splice_alias(inode, dentry);
+ return d_materialise_unique(dentry, inode);
}
unsigned char btrfs_filetype_table[] = {
@@ -5799,6 +5803,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
}
out_unlock:
btrfs_end_transaction(trans, root);
+ btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
if (drop_inode) {
inode_dec_link_count(inode);
@@ -5872,6 +5877,7 @@ out_unlock:
inode_dec_link_count(inode);
iput(inode);
}
+ btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
return err;
}
@@ -5930,6 +5936,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
}
btrfs_end_transaction(trans, root);
+ btrfs_balance_delayed_items(root);
fail:
if (drop_inode) {
inode_dec_link_count(inode);
@@ -5996,6 +6003,7 @@ out_fail:
btrfs_end_transaction(trans, root);
if (drop_on_err)
iput(inode);
+ btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
return err;
}
@@ -6550,6 +6558,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
int ret;
struct extent_buffer *leaf;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
u64 disk_bytenr;
@@ -6626,6 +6635,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
if (btrfs_extent_readonly(root, disk_bytenr))
goto out;
+
+ num_bytes = min(offset + *len, extent_end) - offset;
+ if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 range_end;
+
+ range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
+ ret = test_range_bit(io_tree, offset, range_end,
+ EXTENT_DELALLOC, 0, NULL);
+ if (ret) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+
btrfs_release_path(path);
/*
@@ -6654,7 +6677,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
*/
disk_bytenr += backref_offset;
disk_bytenr += offset - key.offset;
- num_bytes = min(offset + *len, extent_end) - offset;
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
goto out;
/*
@@ -7024,10 +7046,9 @@ again:
if (!ret)
goto out_test;
- ordered->work.func = finish_ordered_fn;
- ordered->work.flags = 0;
- btrfs_queue_worker(&root->fs_info->endio_write_workers,
- &ordered->work);
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+ btrfs_queue_work(root->fs_info->endio_write_workers,
+ &ordered->work);
out_test:
/*
* our bio might span multiple ordered extents. If we haven't
@@ -7404,15 +7425,15 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
smp_mb__after_atomic_inc();
/*
- * The generic stuff only does filemap_write_and_wait_range, which isn't
- * enough if we've written compressed pages to this area, so we need to
- * call btrfs_wait_ordered_range to make absolutely sure that any
- * outstanding dirty pages are on disk.
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so
+ * we need to flush the dirty pages again to make absolutely sure
+ * that any outstanding dirty pages are on disk.
*/
count = iov_length(iov, nr_segs);
- ret = btrfs_wait_ordered_range(inode, offset, count);
- if (ret)
- return ret;
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_fdatawrite_range(inode->i_mapping, offset, count);
if (rw & WRITE) {
/*
@@ -8404,7 +8425,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
work->inode = inode;
work->wait = wait;
work->delay_iput = delay_iput;
- work->work.func = btrfs_run_delalloc_work;
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
return work;
}
@@ -8419,7 +8440,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
+ int nr)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -8431,6 +8453,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
INIT_LIST_HEAD(&works);
INIT_LIST_HEAD(&splice);
+ mutex_lock(&root->delalloc_mutex);
spin_lock(&root->delalloc_lock);
list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
@@ -8453,12 +8476,14 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
else
iput(inode);
ret = -ENOMEM;
- goto out;
+ break;
}
list_add_tail(&work->list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &work->work);
-
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
+ ret++;
+ if (nr != -1 && ret >= nr)
+ break;
cond_resched();
spin_lock(&root->delalloc_lock);
}
@@ -8468,18 +8493,13 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
list_del_init(&work->list);
btrfs_wait_and_free_delalloc_work(work);
}
- return 0;
-out:
- list_for_each_entry_safe(work, next, &works, list) {
- list_del_init(&work->list);
- btrfs_wait_and_free_delalloc_work(work);
- }
if (!list_empty_careful(&splice)) {
spin_lock(&root->delalloc_lock);
list_splice_tail(&splice, &root->delalloc_inodes);
spin_unlock(&root->delalloc_lock);
}
+ mutex_unlock(&root->delalloc_mutex);
return ret;
}
@@ -8490,7 +8510,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return -EROFS;
- ret = __start_delalloc_inodes(root, delay_iput);
+ ret = __start_delalloc_inodes(root, delay_iput, -1);
+ if (ret > 0)
+ ret = 0;
/*
* the filemap_flush will queue IO into the worker threads, but
* we have to make sure the IO is actually started and that
@@ -8507,7 +8529,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
return ret;
}
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+ int nr)
{
struct btrfs_root *root;
struct list_head splice;
@@ -8518,9 +8541,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
INIT_LIST_HEAD(&splice);
+ mutex_lock(&fs_info->delalloc_root_mutex);
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
- while (!list_empty(&splice)) {
+ while (!list_empty(&splice) && nr) {
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
root = btrfs_grab_fs_root(root);
@@ -8529,15 +8553,20 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = __start_delalloc_inodes(root, delay_iput);
+ ret = __start_delalloc_inodes(root, delay_iput, nr);
btrfs_put_fs_root(root);
- if (ret)
+ if (ret < 0)
goto out;
+ if (nr != -1) {
+ nr -= ret;
+ WARN_ON(nr < 0);
+ }
spin_lock(&fs_info->delalloc_root_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
+ ret = 0;
atomic_inc(&fs_info->async_submit_draining);
while (atomic_read(&fs_info->nr_async_submits) ||
atomic_read(&fs_info->async_delalloc_pages)) {
@@ -8546,13 +8575,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
atomic_read(&fs_info->async_delalloc_pages) == 0));
}
atomic_dec(&fs_info->async_submit_draining);
- return 0;
out:
if (!list_empty_careful(&splice)) {
spin_lock(&fs_info->delalloc_root_lock);
list_splice_tail(&splice, &fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
}
+ mutex_unlock(&fs_info->delalloc_root_mutex);
return ret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 383ab455bfa7..0401397b5c92 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,32 @@
#include "props.h"
#include "sysfs.h"
+#ifdef CONFIG_64BIT
+/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
+ * structures are incorrect, as the timespec structure from userspace
+ * is 4 bytes too small. We define these alternatives here to teach
+ * the kernel about the 32-bit struct packing.
+ */
+struct btrfs_ioctl_timespec_32 {
+ __u64 sec;
+ __u32 nsec;
+} __attribute__ ((__packed__));
+
+struct btrfs_ioctl_received_subvol_args_32 {
+ char uuid[BTRFS_UUID_SIZE]; /* in */
+ __u64 stransid; /* in */
+ __u64 rtransid; /* out */
+ struct btrfs_ioctl_timespec_32 stime; /* in */
+ struct btrfs_ioctl_timespec_32 rtime; /* out */
+ __u64 flags; /* in */
+ __u64 reserved[16]; /* in */
+} __attribute__ ((__packed__));
+
+#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
+ struct btrfs_ioctl_received_subvol_args_32)
+#endif
+
+
static int btrfs_clone(struct inode *src, struct inode *inode,
u64 off, u64 olen, u64 olen_aligned, u64 destoff);
@@ -585,6 +611,23 @@ fail:
return ret;
}
+static void btrfs_wait_nocow_write(struct btrfs_root *root)
+{
+ s64 writers;
+ DEFINE_WAIT(wait);
+
+ do {
+ prepare_to_wait(&root->subv_writers->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ writers = percpu_counter_sum(&root->subv_writers->counter);
+ if (writers)
+ schedule();
+
+ finish_wait(&root->subv_writers->wait, &wait);
+ } while (writers);
+}
+
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct dentry *dentry, char *name, int namelen,
u64 *async_transid, bool readonly,
@@ -598,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!root->ref_cows)
return -EINVAL;
+ atomic_inc(&root->will_be_snapshoted);
+ smp_mb__after_atomic_inc();
+ btrfs_wait_nocow_write(root);
+
ret = btrfs_start_delalloc_inodes(root, 0);
if (ret)
- return ret;
+ goto out;
btrfs_wait_ordered_extents(root, -1);
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
- if (!pending_snapshot)
- return -ENOMEM;
+ if (!pending_snapshot) {
+ ret = -ENOMEM;
+ goto out;
+ }
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
@@ -623,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
&pending_snapshot->qgroup_reserved,
false);
if (ret)
- goto out;
+ goto free;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
@@ -674,8 +723,10 @@ fail:
btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
&pending_snapshot->block_rsv,
pending_snapshot->qgroup_reserved);
-out:
+free:
kfree(pending_snapshot);
+out:
+ atomic_dec(&root->will_be_snapshoted);
return ret;
}
@@ -884,12 +935,14 @@ static int find_new_extents(struct btrfs_root *root,
min_key.type = BTRFS_EXTENT_DATA_KEY;
min_key.offset = *off;
- path->keep_locks = 1;
-
while (1) {
+ path->keep_locks = 1;
ret = btrfs_search_forward(root, &min_key, path, newer_than);
if (ret != 0)
goto none;
+ path->keep_locks = 0;
+ btrfs_unlock_up_safe(path, 1);
+process_slot:
if (min_key.objectid != ino)
goto none;
if (min_key.type != BTRFS_EXTENT_DATA_KEY)
@@ -908,6 +961,12 @@ static int find_new_extents(struct btrfs_root *root,
return 0;
}
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(leaf)) {
+ btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
+ goto process_slot;
+ }
+
if (min_key.offset == (u64)-1)
goto none;
@@ -935,10 +994,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
read_unlock(&em_tree->lock);
if (!em) {
+ struct extent_state *cached = NULL;
+ u64 end = start + len - 1;
+
/* get the big lock and read metadata off disk */
- lock_extent(io_tree, start, start + len - 1);
+ lock_extent_bits(io_tree, start, end, 0, &cached);
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
- unlock_extent(io_tree, start, start + len - 1);
+ unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
if (IS_ERR(em))
return NULL;
@@ -957,7 +1019,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
return false;
next = defrag_lookup_extent(inode, em->start + em->len);
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
+ (em->block_start + em->block_len == next->block_start))
ret = false;
free_extent_map(next);
@@ -1076,10 +1139,12 @@ again:
page_start = page_offset(page);
page_end = page_start + PAGE_CACHE_SIZE - 1;
while (1) {
- lock_extent(tree, page_start, page_end);
+ lock_extent_bits(tree, page_start, page_end,
+ 0, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode,
page_start);
- unlock_extent(tree, page_start, page_end);
+ unlock_extent_cached(tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
if (!ordered)
break;
@@ -1356,8 +1421,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
}
- if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
+ if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
/* the filemap_flush will queue IO into the worker threads, but
@@ -1573,7 +1642,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
if (src_inode->i_sb != file_inode(file)->i_sb) {
btrfs_info(BTRFS_I(src_inode)->root->fs_info,
"Snapshot src from another FS");
- ret = -EINVAL;
+ ret = -EXDEV;
} else if (!inode_owner_or_capable(src_inode)) {
/*
* Subvolume creation is not restricted, but snapshots
@@ -1797,7 +1866,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
if (key.objectid == root->root_key.objectid) {
- ret = -ENOTEMPTY;
+ ret = -EPERM;
+ btrfs_err(root->fs_info, "deleting default subvolume "
+ "%llu is not allowed", key.objectid);
goto out;
}
btrfs_release_path(path);
@@ -2994,8 +3065,9 @@ process_slot:
new_key.offset + datal,
1);
if (ret) {
- btrfs_abort_transaction(trans, root,
- ret);
+ if (ret != -EINVAL)
+ btrfs_abort_transaction(trans,
+ root, ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3153,8 +3225,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
* decompress into destination's address_space (the file offset
* may change, so source mapping won't do), then recompress (or
* otherwise reinsert) a subrange.
- * - allow ranges within the same file to be cloned (provided
- * they don't overlap)?
+ *
+ * - split destination inode's inline extents. The inline extents can
+ * be either compressed or non-compressed.
*/
/* the destination must be opened for writing */
@@ -3537,20 +3610,6 @@ out:
return ret;
}
-static long btrfs_ioctl_global_rsv(struct btrfs_root *root, void __user *arg)
-{
- struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
- u64 reserved;
-
- spin_lock(&block_rsv->lock);
- reserved = block_rsv->reserved;
- spin_unlock(&block_rsv->lock);
-
- if (arg && copy_to_user(arg, &reserved, sizeof(reserved)))
- return -EFAULT;
- return 0;
-}
-
/*
* there are many ways the trans_start and trans_end ioctls can lead
* to deadlocks. They should only be used by applications that
@@ -4367,10 +4426,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
return btrfs_qgroup_wait_for_completion(root->fs_info);
}
-static long btrfs_ioctl_set_received_subvol(struct file *file,
- void __user *arg)
+static long _btrfs_ioctl_set_received_subvol(struct file *file,
+ struct btrfs_ioctl_received_subvol_args *sa)
{
- struct btrfs_ioctl_received_subvol_args *sa = NULL;
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root_item *root_item = &root->root_item;
@@ -4398,13 +4456,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
goto out;
}
- sa = memdup_user(arg, sizeof(*sa));
- if (IS_ERR(sa)) {
- ret = PTR_ERR(sa);
- sa = NULL;
- goto out;
- }
-
/*
* 1 - root item
* 2 - uuid items (received uuid + subvol uuid)
@@ -4458,14 +4509,91 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
goto out;
}
+out:
+ up_write(&root->fs_info->subvol_sem);
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+#ifdef CONFIG_64BIT
+static long btrfs_ioctl_set_received_subvol_32(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
+ struct btrfs_ioctl_received_subvol_args *args64 = NULL;
+ int ret = 0;
+
+ args32 = memdup_user(arg, sizeof(*args32));
+ if (IS_ERR(args32)) {
+ ret = PTR_ERR(args32);
+ args32 = NULL;
+ goto out;
+ }
+
+ args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+ if (IS_ERR(args64)) {
+ ret = PTR_ERR(args64);
+ args64 = NULL;
+ goto out;
+ }
+
+ memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
+ args64->stransid = args32->stransid;
+ args64->rtransid = args32->rtransid;
+ args64->stime.sec = args32->stime.sec;
+ args64->stime.nsec = args32->stime.nsec;
+ args64->rtime.sec = args32->rtime.sec;
+ args64->rtime.nsec = args32->rtime.nsec;
+ args64->flags = args32->flags;
+
+ ret = _btrfs_ioctl_set_received_subvol(file, args64);
+ if (ret)
+ goto out;
+
+ memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
+ args32->stransid = args64->stransid;
+ args32->rtransid = args64->rtransid;
+ args32->stime.sec = args64->stime.sec;
+ args32->stime.nsec = args64->stime.nsec;
+ args32->rtime.sec = args64->rtime.sec;
+ args32->rtime.nsec = args64->rtime.nsec;
+ args32->flags = args64->flags;
+
+ ret = copy_to_user(arg, args32, sizeof(*args32));
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kfree(args32);
+ kfree(args64);
+ return ret;
+}
+#endif
+
+static long btrfs_ioctl_set_received_subvol(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args *sa = NULL;
+ int ret = 0;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ sa = NULL;
+ goto out;
+ }
+
+ ret = _btrfs_ioctl_set_received_subvol(file, sa);
+
+ if (ret)
+ goto out;
+
ret = copy_to_user(arg, sa, sizeof(*sa));
if (ret)
ret = -EFAULT;
out:
kfree(sa);
- up_write(&root->fs_info->subvol_sem);
- mnt_drop_write_file(file);
return ret;
}
@@ -4757,12 +4885,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_logical_to_ino(root, argp);
case BTRFS_IOC_SPACE_INFO:
return btrfs_ioctl_space_info(root, argp);
- case BTRFS_IOC_GLOBAL_RSV:
- return btrfs_ioctl_global_rsv(root, argp);
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(root->fs_info, 0);
+ ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
if (ret)
return ret;
ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
@@ -4786,6 +4912,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_balance_progress(root, argp);
case BTRFS_IOC_SET_RECEIVED_SUBVOL:
return btrfs_ioctl_set_received_subvol(file, argp);
+#ifdef CONFIG_64BIT
+ case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
+ return btrfs_ioctl_set_received_subvol_32(file, argp);
+#endif
case BTRFS_IOC_SEND:
return btrfs_ioctl_send(file, argp);
case BTRFS_IOC_GET_DEV_STATS:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b16450b840e7..a94b05f72869 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -349,10 +349,13 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
if (!uptodate)
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
- if (entry->bytes_left == 0)
+ if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- else
+ if (waitqueue_active(&entry->wait))
+ wake_up(&entry->wait);
+ } else {
ret = 1;
+ }
out:
if (!ret && cached && entry) {
*cached = entry;
@@ -410,10 +413,13 @@ have_entry:
if (!uptodate)
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
- if (entry->bytes_left == 0)
+ if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- else
+ if (waitqueue_active(&entry->wait))
+ wake_up(&entry->wait);
+ } else {
ret = 1;
+ }
out:
if (!ret && cached && entry) {
*cached = entry;
@@ -424,27 +430,48 @@ out:
}
/* Needs to either be called under a log transaction or the log_mutex */
-void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_ordered_extent *ordered;
struct rb_node *n;
- int index = log->log_transid % 2;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
- spin_lock(&log->log_extents_lock[index]);
- if (list_empty(&ordered->log_list)) {
- list_add_tail(&ordered->log_list, &log->logged_list[index]);
- atomic_inc(&ordered->refs);
- }
- spin_unlock(&log->log_extents_lock[index]);
+ if (!list_empty(&ordered->log_list))
+ continue;
+ list_add_tail(&ordered->log_list, logged_list);
+ atomic_inc(&ordered->refs);
}
spin_unlock_irq(&tree->lock);
}
+void btrfs_put_logged_extents(struct list_head *logged_list)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ while (!list_empty(logged_list)) {
+ ordered = list_first_entry(logged_list,
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log)
+{
+ int index = log->log_transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ list_splice_tail(logged_list, &log->logged_list[index]);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
{
struct btrfs_ordered_extent *ordered;
@@ -577,7 +604,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
INIT_LIST_HEAD(&splice);
INIT_LIST_HEAD(&works);
- mutex_lock(&root->fs_info->ordered_operations_mutex);
+ mutex_lock(&root->ordered_extent_mutex);
spin_lock(&root->ordered_extent_lock);
list_splice_init(&root->ordered_extents, &splice);
while (!list_empty(&splice) && nr) {
@@ -588,10 +615,11 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
atomic_inc(&ordered->refs);
spin_unlock(&root->ordered_extent_lock);
- ordered->flush_work.func = btrfs_run_ordered_extent_work;
+ btrfs_init_work(&ordered->flush_work,
+ btrfs_run_ordered_extent_work, NULL, NULL);
list_add_tail(&ordered->work_list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &ordered->flush_work);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &ordered->flush_work);
cond_resched();
spin_lock(&root->ordered_extent_lock);
@@ -608,7 +636,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
btrfs_put_ordered_extent(ordered);
cond_resched();
}
- mutex_unlock(&root->fs_info->ordered_operations_mutex);
+ mutex_unlock(&root->ordered_extent_mutex);
return count;
}
@@ -621,6 +649,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
INIT_LIST_HEAD(&splice);
+ mutex_lock(&fs_info->ordered_operations_mutex);
spin_lock(&fs_info->ordered_root_lock);
list_splice_init(&fs_info->ordered_roots, &splice);
while (!list_empty(&splice) && nr) {
@@ -643,6 +672,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
}
list_splice_tail(&splice, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
+ mutex_unlock(&fs_info->ordered_operations_mutex);
}
/*
@@ -704,8 +734,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
goto out;
}
list_add_tail(&work->list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &work->work);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
cond_resched();
spin_lock(&root->fs_info->ordered_root_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9b0450f7ac20..246897058efb 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -197,7 +197,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct inode *inode);
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
-void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list);
+void btrfs_put_logged_extents(struct list_head *logged_list);
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log);
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
int __init ordered_data_init(void);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 472302a2d745..2cf905877aaf 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1509,8 +1509,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
}
ret = 0;
}
@@ -2095,7 +2095,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_work, 0,
sizeof(fs_info->qgroup_rescan_work));
- fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
+ btrfs_init_work(&fs_info->qgroup_rescan_work,
+ btrfs_qgroup_rescan_worker, NULL, NULL);
if (ret) {
err:
@@ -2158,8 +2159,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup_rescan_zero_tracking(fs_info);
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
return 0;
}
@@ -2190,6 +2191,6 @@ void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9af0b25d991a..4055291a523e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1416,20 +1416,18 @@ cleanup:
static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
- rbio->work.flags = 0;
- rbio->work.func = rmw_work;
+ btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
- btrfs_queue_worker(&rbio->fs_info->rmw_workers,
- &rbio->work);
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
}
static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
- rbio->work.flags = 0;
- rbio->work.func = read_rebuild_work;
+ btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
- btrfs_queue_worker(&rbio->fs_info->rmw_workers,
- &rbio->work);
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
}
/*
@@ -1667,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- plug->work.flags = 0;
- plug->work.func = unplug_work;
- btrfs_queue_worker(&plug->info->rmw_workers,
- &plug->work);
+ btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
+ btrfs_queue_work(plug->info->rmw_workers,
+ &plug->work);
return;
}
run_plug(plug);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 31c797c48c3e..30947f923620 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -793,10 +793,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
/* FIXME we cannot handle this properly right now */
BUG();
}
- rmw->work.func = reada_start_machine_worker;
+ btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
rmw->fs_info = fs_info;
- btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
+ btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
}
#ifdef DEBUG
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 07b3b36f40ee..def428a25b2a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4248,7 +4248,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
rc->block_group->key.objectid, rc->block_group->flags);
- ret = btrfs_start_delalloc_roots(fs_info, 0);
+ ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
if (ret < 0) {
err = ret;
goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 1389b69059de..38bb47e7d6b1 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/err.h>
#include <linux/uuid.h>
#include "ctree.h"
#include "transaction.h"
@@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
key.offset++;
root = btrfs_read_fs_root(tree_root, &root_key);
- err = PTR_RET(root);
+ err = PTR_ERR_OR_ZERO(root);
if (err && err != -ENOENT) {
break;
} else if (err == -ENOENT) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efba5d1282ee..93e6d7172844 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -315,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
atomic_inc(&fs_info->scrubs_running);
atomic_inc(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
+
+ /*
+ * check if @scrubs_running=@scrubs_paused condition
+ * inside wait_event() is not an atomic operation.
+ * which means we may inc/dec @scrub_running/paused
+ * at any time. Let's wake up @scrub_pause_wait as
+ * much as we can to let commit transaction blocked less.
+ */
+ wake_up(&fs_info->scrub_pause_wait);
+
atomic_inc(&sctx->workers_pending);
}
@@ -418,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sbio->index = i;
sbio->sctx = sctx;
sbio->page_count = 0;
- sbio->work.func = scrub_bio_end_io_worker;
+ btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
+ NULL, NULL);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
@@ -987,9 +998,10 @@ nodatasum_case:
fixup_nodatasum->root = fs_info->extent_root;
fixup_nodatasum->mirror_num = failed_mirror_index + 1;
scrub_pending_trans_workers_inc(sctx);
- fixup_nodatasum->work.func = scrub_fixup_nodatasum;
- btrfs_queue_worker(&fs_info->scrub_workers,
- &fixup_nodatasum->work);
+ btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
+ NULL, NULL);
+ btrfs_queue_work(fs_info->scrub_workers,
+ &fixup_nodatasum->work);
goto out;
}
@@ -1603,8 +1615,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
sbio->err = err;
sbio->bio = bio;
- sbio->work.func = scrub_wr_bio_end_io_worker;
- btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
+ btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
+ btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
@@ -2072,7 +2084,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
sbio->err = err;
sbio->bio = bio;
- btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+ btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
}
static void scrub_bio_end_io_worker(struct btrfs_work *work)
@@ -2686,10 +2698,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
- atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ /*
+ * must be called before we decrease @scrub_paused.
+ * make sure we don't block transaction commit while
+ * we are waiting pending workers finished.
+ */
wait_event(sctx->list_wait,
atomic_read(&sctx->workers_pending) == 0);
- scrub_blocked_if_needed(fs_info);
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+
+ mutex_lock(&fs_info->scrub_lock);
+ __scrub_blocked_if_needed(fs_info);
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ wake_up(&fs_info->scrub_pause_wait);
btrfs_put_block_group(cache);
if (ret)
@@ -2757,33 +2782,35 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
int ret = 0;
+ int flags = WQ_FREEZABLE | WQ_UNBOUND;
+ int max_active = fs_info->thread_pool_size;
if (fs_info->scrub_workers_refcnt == 0) {
if (is_dev_replace)
- btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
- &fs_info->generic_worker);
+ fs_info->scrub_workers =
+ btrfs_alloc_workqueue("btrfs-scrub", flags,
+ 1, 4);
else
- btrfs_init_workers(&fs_info->scrub_workers, "scrub",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- fs_info->scrub_workers.idle_thresh = 4;
- ret = btrfs_start_workers(&fs_info->scrub_workers);
- if (ret)
+ fs_info->scrub_workers =
+ btrfs_alloc_workqueue("btrfs-scrub", flags,
+ max_active, 4);
+ if (!fs_info->scrub_workers) {
+ ret = -ENOMEM;
goto out;
- btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
- "scrubwrc",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- fs_info->scrub_wr_completion_workers.idle_thresh = 2;
- ret = btrfs_start_workers(
- &fs_info->scrub_wr_completion_workers);
- if (ret)
+ }
+ fs_info->scrub_wr_completion_workers =
+ btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+ max_active, 2);
+ if (!fs_info->scrub_wr_completion_workers) {
+ ret = -ENOMEM;
goto out;
- btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
- &fs_info->generic_worker);
- ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
- if (ret)
+ }
+ fs_info->scrub_nocow_workers =
+ btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+ if (!fs_info->scrub_nocow_workers) {
+ ret = -ENOMEM;
goto out;
+ }
}
++fs_info->scrub_workers_refcnt;
out:
@@ -2793,9 +2820,9 @@ out:
static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
if (--fs_info->scrub_workers_refcnt == 0) {
- btrfs_stop_workers(&fs_info->scrub_workers);
- btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
- btrfs_stop_workers(&fs_info->scrub_nocow_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
}
WARN_ON(fs_info->scrub_workers_refcnt < 0);
}
@@ -3106,10 +3133,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
nocow_ctx->len = len;
nocow_ctx->mirror_num = mirror_num;
nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
- nocow_ctx->work.func = copy_nocow_pages_worker;
+ btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
INIT_LIST_HEAD(&nocow_ctx->inodes);
- btrfs_queue_worker(&fs_info->scrub_nocow_workers,
- &nocow_ctx->work);
+ btrfs_queue_work(fs_info->scrub_nocow_workers,
+ &nocow_ctx->work);
return 0;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9c8d1a3fdc3a..9b6da9d55f9a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -51,15 +51,18 @@ struct fs_path {
struct {
char *start;
char *end;
- char *prepared;
char *buf;
- int buf_len;
- unsigned int reversed:1;
- unsigned int virtual_mem:1;
+ unsigned short buf_len:15;
+ unsigned short reversed:1;
char inline_buf[];
};
- char pad[PAGE_SIZE];
+ /*
+ * Average path length does not exceed 200 bytes, we'll have
+ * better packing in the slab and higher chance to satisfy
+ * a allocation later during send.
+ */
+ char pad[256];
};
};
#define FS_PATH_INLINE_SIZE \
@@ -109,6 +112,7 @@ struct send_ctx {
int cur_inode_deleted;
u64 cur_inode_size;
u64 cur_inode_mode;
+ u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 send_progress;
@@ -120,6 +124,8 @@ struct send_ctx {
struct list_head name_cache_list;
int name_cache_size;
+ struct file_ra_state ra;
+
char *read_buf;
/*
@@ -175,6 +181,47 @@ struct send_ctx {
* own move/rename can be performed.
*/
struct rb_root waiting_dir_moves;
+
+ /*
+ * A directory that is going to be rm'ed might have a child directory
+ * which is in the pending directory moves index above. In this case,
+ * the directory can only be removed after the move/rename of its child
+ * is performed. Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- c/ (ino 259)
+ * | |-- x/ (ino 260)
+ * |
+ * |-- y/ (ino 261)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- YY/ (ino 261)
+ * |-- x/ (ino 260)
+ *
+ * Sequence of steps that lead to the send snapshot:
+ * rm -f /a/b/c/foo.txt
+ * mv /a/b/y /a/b/YY
+ * mv /a/b/c/x /a/b/YY
+ * rmdir /a/b/c
+ *
+ * When the child is processed, its move/rename is delayed until its
+ * parent is processed (as explained above), but all other operations
+ * like update utimes, chown, chgrp, etc, are performed and the paths
+ * that it uses for those operations must use the orphanized name of
+ * its parent (the directory we're going to rm later), so we need to
+ * memorize that name.
+ *
+ * Indexed by the inode number of the directory to be deleted.
+ */
+ struct rb_root orphan_dirs;
};
struct pending_dir_move {
@@ -189,6 +236,18 @@ struct pending_dir_move {
struct waiting_dir_move {
struct rb_node node;
u64 ino;
+ /*
+ * There might be some directory that could not be removed because it
+ * was waiting for this directory inode to be moved first. Therefore
+ * after this directory is moved, we can try to rmdir the ino rmdir_ino.
+ */
+ u64 rmdir_ino;
+};
+
+struct orphan_dir_info {
+ struct rb_node node;
+ u64 ino;
+ u64 gen;
};
struct name_cache_entry {
@@ -214,6 +273,11 @@ struct name_cache_entry {
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
+
static int need_send_hole(struct send_ctx *sctx)
{
return (sctx->parent_root && !sctx->cur_inode_new &&
@@ -242,7 +306,6 @@ static struct fs_path *fs_path_alloc(void)
if (!p)
return NULL;
p->reversed = 0;
- p->virtual_mem = 0;
p->buf = p->inline_buf;
p->buf_len = FS_PATH_INLINE_SIZE;
fs_path_reset(p);
@@ -265,12 +328,8 @@ static void fs_path_free(struct fs_path *p)
{
if (!p)
return;
- if (p->buf != p->inline_buf) {
- if (p->virtual_mem)
- vfree(p->buf);
- else
- kfree(p->buf);
- }
+ if (p->buf != p->inline_buf)
+ kfree(p->buf);
kfree(p);
}
@@ -292,40 +351,23 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
path_len = p->end - p->start;
old_buf_len = p->buf_len;
- len = PAGE_ALIGN(len);
-
- if (p->buf == p->inline_buf) {
- tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN);
- if (!tmp_buf) {
- tmp_buf = vmalloc(len);
- if (!tmp_buf)
- return -ENOMEM;
- p->virtual_mem = 1;
- }
- memcpy(tmp_buf, p->buf, p->buf_len);
- p->buf = tmp_buf;
- p->buf_len = len;
- } else {
- if (p->virtual_mem) {
- tmp_buf = vmalloc(len);
- if (!tmp_buf)
- return -ENOMEM;
- memcpy(tmp_buf, p->buf, p->buf_len);
- vfree(p->buf);
- } else {
- tmp_buf = krealloc(p->buf, len, GFP_NOFS);
- if (!tmp_buf) {
- tmp_buf = vmalloc(len);
- if (!tmp_buf)
- return -ENOMEM;
- memcpy(tmp_buf, p->buf, p->buf_len);
- kfree(p->buf);
- p->virtual_mem = 1;
- }
- }
- p->buf = tmp_buf;
- p->buf_len = len;
- }
+
+ /*
+ * First time the inline_buf does not suffice
+ */
+ if (p->buf == p->inline_buf)
+ tmp_buf = kmalloc(len, GFP_NOFS);
+ else
+ tmp_buf = krealloc(p->buf, len, GFP_NOFS);
+ if (!tmp_buf)
+ return -ENOMEM;
+ p->buf = tmp_buf;
+ /*
+ * The real size of the buffer is bigger, this will let the fast path
+ * happen most of the time
+ */
+ p->buf_len = ksize(p->buf);
+
if (p->reversed) {
tmp_buf = p->buf + old_buf_len - path_len - 1;
p->end = p->buf + p->buf_len - 1;
@@ -338,7 +380,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
return 0;
}
-static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
+static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
+ char **prepared)
{
int ret;
int new_len;
@@ -354,11 +397,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
if (p->start != p->end)
*--p->start = '/';
p->start -= name_len;
- p->prepared = p->start;
+ *prepared = p->start;
} else {
if (p->start != p->end)
*p->end++ = '/';
- p->prepared = p->end;
+ *prepared = p->end;
p->end += name_len;
*p->end = 0;
}
@@ -370,12 +413,12 @@ out:
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
{
int ret;
+ char *prepared;
- ret = fs_path_prepare_for_add(p, name_len);
+ ret = fs_path_prepare_for_add(p, name_len, &prepared);
if (ret < 0)
goto out;
- memcpy(p->prepared, name, name_len);
- p->prepared = NULL;
+ memcpy(prepared, name, name_len);
out:
return ret;
@@ -384,12 +427,12 @@ out:
static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
{
int ret;
+ char *prepared;
- ret = fs_path_prepare_for_add(p, p2->end - p2->start);
+ ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
if (ret < 0)
goto out;
- memcpy(p->prepared, p2->start, p2->end - p2->start);
- p->prepared = NULL;
+ memcpy(prepared, p2->start, p2->end - p2->start);
out:
return ret;
@@ -400,13 +443,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
unsigned long off, int len)
{
int ret;
+ char *prepared;
- ret = fs_path_prepare_for_add(p, len);
+ ret = fs_path_prepare_for_add(p, len, &prepared);
if (ret < 0)
goto out;
- read_extent_buffer(eb, p->prepared, off, len);
- p->prepared = NULL;
+ read_extent_buffer(eb, prepared, off, len);
out:
return ret;
@@ -915,9 +958,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_dir_item *di;
struct btrfs_key di_key;
char *buf = NULL;
- char *buf2 = NULL;
- int buf_len;
- int buf_virtual = 0;
+ const int buf_len = PATH_MAX;
u32 name_len;
u32 data_len;
u32 cur;
@@ -927,7 +968,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
int num;
u8 type;
- buf_len = PAGE_SIZE;
buf = kmalloc(buf_len, GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
@@ -949,30 +989,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
type = btrfs_dir_type(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+ /*
+ * Path too long
+ */
if (name_len + data_len > buf_len) {
- buf_len = PAGE_ALIGN(name_len + data_len);
- if (buf_virtual) {
- buf2 = vmalloc(buf_len);
- if (!buf2) {
- ret = -ENOMEM;
- goto out;
- }
- vfree(buf);
- } else {
- buf2 = krealloc(buf, buf_len, GFP_NOFS);
- if (!buf2) {
- buf2 = vmalloc(buf_len);
- if (!buf2) {
- ret = -ENOMEM;
- goto out;
- }
- kfree(buf);
- buf_virtual = 1;
- }
- }
-
- buf = buf2;
- buf2 = NULL;
+ ret = -ENAMETOOLONG;
+ goto out;
}
read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -995,10 +1017,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
}
out:
- if (buf_virtual)
- vfree(buf);
- else
- kfree(buf);
+ kfree(buf);
return ret;
}
@@ -1292,8 +1311,6 @@ static int find_extent_clone(struct send_ctx *sctx,
extent_item_pos = logical - found_key.objectid;
else
extent_item_pos = 0;
-
- extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(sctx->send_root->fs_info,
found_key.objectid, extent_item_pos, 1,
__iterate_backrefs, backref_ctx);
@@ -1332,6 +1349,16 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
}
if (cur_clone_root) {
+ if (compressed != BTRFS_COMPRESS_NONE) {
+ /*
+ * Offsets given by iterate_extent_inodes() are relative
+ * to the start of the extent, we need to add logical
+ * offset from the file extent item.
+ * (See why at backref.c:check_extent_in_eb())
+ */
+ cur_clone_root->offset += btrfs_file_extent_offset(eb,
+ fi);
+ }
*found = cur_clone_root;
ret = 0;
} else {
@@ -1408,11 +1435,7 @@ static int gen_unique_name(struct send_ctx *sctx,
while (1) {
len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
ino, gen, idx);
- if (len >= sizeof(tmp)) {
- /* should really not happen */
- ret = -EOVERFLOW;
- goto out;
- }
+ ASSERT(len < sizeof(tmp));
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
path, BTRFS_FIRST_FREE_OBJECTID,
@@ -1888,13 +1911,20 @@ static void name_cache_delete(struct send_ctx *sctx,
nce_head = radix_tree_lookup(&sctx->name_cache,
(unsigned long)nce->ino);
- BUG_ON(!nce_head);
+ if (!nce_head) {
+ btrfs_err(sctx->send_root->fs_info,
+ "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
+ nce->ino, sctx->name_cache_size);
+ }
list_del(&nce->radix_list);
list_del(&nce->list);
sctx->name_cache_size--;
- if (list_empty(nce_head)) {
+ /*
+ * We may not get to the final release of nce_head if the lookup fails
+ */
+ if (nce_head && list_empty(nce_head)) {
radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
kfree(nce_head);
}
@@ -1967,7 +1997,6 @@ static void name_cache_free(struct send_ctx *sctx)
*/
static int __get_cur_name_and_parent(struct send_ctx *sctx,
u64 ino, u64 gen,
- int skip_name_cache,
u64 *parent_ino,
u64 *parent_gen,
struct fs_path *dest)
@@ -1977,8 +2006,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
struct btrfs_path *path = NULL;
struct name_cache_entry *nce = NULL;
- if (skip_name_cache)
- goto get_ref;
/*
* First check if we already did a call to this function with the same
* ino/gen. If yes, check if the cache entry is still up-to-date. If yes
@@ -2023,12 +2050,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
goto out_cache;
}
-get_ref:
/*
* Depending on whether the inode was already processed or not, use
* send_root or parent_root for ref lookup.
*/
- if (ino < sctx->send_progress && !skip_name_cache)
+ if (ino < sctx->send_progress)
ret = get_first_ref(sctx->send_root, ino,
parent_ino, parent_gen, dest);
else
@@ -2052,8 +2078,6 @@ get_ref:
goto out;
ret = 1;
}
- if (skip_name_cache)
- goto out;
out_cache:
/*
@@ -2121,9 +2145,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
u64 parent_inode = 0;
u64 parent_gen = 0;
int stop = 0;
- u64 start_ino = ino;
- u64 start_gen = gen;
- int skip_name_cache = 0;
name = fs_path_alloc();
if (!name) {
@@ -2131,31 +2152,33 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
goto out;
}
- if (is_waiting_for_move(sctx, ino))
- skip_name_cache = 1;
-
-again:
dest->reversed = 1;
fs_path_reset(dest);
while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
fs_path_reset(name);
- ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache,
- &parent_inode, &parent_gen, name);
+ if (is_waiting_for_rm(sctx, ino)) {
+ ret = gen_unique_name(sctx, ino, gen, name);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(dest, name);
+ break;
+ }
+
+ if (is_waiting_for_move(sctx, ino)) {
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret)
+ stop = 1;
+ }
+
if (ret < 0)
goto out;
- if (ret)
- stop = 1;
-
- if (!skip_name_cache &&
- is_waiting_for_move(sctx, parent_inode)) {
- ino = start_ino;
- gen = start_gen;
- stop = 0;
- skip_name_cache = 1;
- goto again;
- }
ret = fs_path_add_path(dest, name);
if (ret < 0)
@@ -2419,10 +2442,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
if (!p)
return -ENOMEM;
- ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
- NULL, &rdev);
- if (ret < 0)
- goto out;
+ if (ino != sctx->cur_ino) {
+ ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
+ NULL, NULL, &rdev);
+ if (ret < 0)
+ goto out;
+ } else {
+ gen = sctx->cur_inode_gen;
+ mode = sctx->cur_inode_mode;
+ rdev = sctx->cur_inode_rdev;
+ }
if (S_ISREG(mode)) {
cmd = BTRFS_SEND_C_MKFILE;
@@ -2502,17 +2531,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
+ ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
while (1) {
- ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
- 1, 0);
- if (ret < 0)
- goto out;
- if (!ret) {
- eb = path->nodes[0];
- slot = path->slots[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(sctx->send_root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
}
- if (ret || found_key.objectid != key.objectid ||
+
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
goto out;
@@ -2527,8 +2565,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
goto out;
}
- key.offset = found_key.offset + 1;
- btrfs_release_path(path);
+ path->slots[0]++;
}
out:
@@ -2580,7 +2617,7 @@ struct recorded_ref {
* everything mixed. So we first record all refs and later process them.
* This function is a helper to record one ref.
*/
-static int record_ref(struct list_head *head, u64 dir,
+static int __record_ref(struct list_head *head, u64 dir,
u64 dir_gen, struct fs_path *path)
{
struct recorded_ref *ref;
@@ -2666,12 +2703,78 @@ out:
return ret;
}
+static struct orphan_dir_info *
+add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct rb_node **p = &sctx->orphan_dirs.rb_node;
+ struct rb_node *parent = NULL;
+ struct orphan_dir_info *entry, *odi;
+
+ odi = kmalloc(sizeof(*odi), GFP_NOFS);
+ if (!odi)
+ return ERR_PTR(-ENOMEM);
+ odi->ino = dir_ino;
+ odi->gen = 0;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino) {
+ p = &(*p)->rb_left;
+ } else if (dir_ino > entry->ino) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(odi);
+ return entry;
+ }
+ }
+
+ rb_link_node(&odi->node, parent, p);
+ rb_insert_color(&odi->node, &sctx->orphan_dirs);
+ return odi;
+}
+
+static struct orphan_dir_info *
+get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct rb_node *n = sctx->orphan_dirs.rb_node;
+ struct orphan_dir_info *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino)
+ n = n->rb_left;
+ else if (dir_ino > entry->ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
+
+ return odi != NULL;
+}
+
+static void free_orphan_dir_info(struct send_ctx *sctx,
+ struct orphan_dir_info *odi)
+{
+ if (!odi)
+ return;
+ rb_erase(&odi->node, &sctx->orphan_dirs);
+ kfree(odi);
+}
+
/*
* Returns 1 if a directory can be removed at this point in time.
* We check this by iterating all dir items and checking if the inode behind
* the dir item was already processed.
*/
-static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+ u64 send_progress)
{
int ret = 0;
struct btrfs_root *root = sctx->parent_root;
@@ -2694,31 +2797,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
while (1) {
- ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0)
- goto out;
- if (!ret) {
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
+ struct waiting_dir_move *dm;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
}
- if (ret || found_key.objectid != key.objectid ||
- found_key.type != key.type) {
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type)
break;
- }
di = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
+ dm = get_waiting_dir_move(sctx, loc.objectid);
+ if (dm) {
+ struct orphan_dir_info *odi;
+
+ odi = add_orphan_dir_info(sctx, dir);
+ if (IS_ERR(odi)) {
+ ret = PTR_ERR(odi);
+ goto out;
+ }
+ odi->gen = dir_gen;
+ dm->rmdir_ino = dir;
+ ret = 0;
+ goto out;
+ }
+
if (loc.objectid > send_progress) {
ret = 0;
goto out;
}
- btrfs_release_path(path);
- key.offset = found_key.offset + 1;
+ path->slots[0]++;
}
ret = 1;
@@ -2730,19 +2854,9 @@ out:
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
{
- struct rb_node *n = sctx->waiting_dir_moves.rb_node;
- struct waiting_dir_move *entry;
+ struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
- while (n) {
- entry = rb_entry(n, struct waiting_dir_move, node);
- if (ino < entry->ino)
- n = n->rb_left;
- else if (ino > entry->ino)
- n = n->rb_right;
- else
- return 1;
- }
- return 0;
+ return entry != NULL;
}
static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
@@ -2755,6 +2869,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
if (!dm)
return -ENOMEM;
dm->ino = ino;
+ dm->rmdir_ino = 0;
while (*p) {
parent = *p;
@@ -2774,31 +2889,41 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
return 0;
}
-static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
{
struct rb_node *n = sctx->waiting_dir_moves.rb_node;
struct waiting_dir_move *entry;
while (n) {
entry = rb_entry(n, struct waiting_dir_move, node);
- if (ino < entry->ino) {
+ if (ino < entry->ino)
n = n->rb_left;
- } else if (ino > entry->ino) {
+ else if (ino > entry->ino)
n = n->rb_right;
- } else {
- rb_erase(&entry->node, &sctx->waiting_dir_moves);
- kfree(entry);
- return 0;
- }
+ else
+ return entry;
}
- return -ENOENT;
+ return NULL;
}
-static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
+static void free_waiting_dir_move(struct send_ctx *sctx,
+ struct waiting_dir_move *dm)
+{
+ if (!dm)
+ return;
+ rb_erase(&dm->node, &sctx->waiting_dir_moves);
+ kfree(dm);
+}
+
+static int add_pending_dir_move(struct send_ctx *sctx,
+ u64 ino,
+ u64 ino_gen,
+ u64 parent_ino)
{
struct rb_node **p = &sctx->pending_dir_moves.rb_node;
struct rb_node *parent = NULL;
- struct pending_dir_move *entry, *pm;
+ struct pending_dir_move *entry = NULL, *pm;
struct recorded_ref *cur;
int exists = 0;
int ret;
@@ -2807,8 +2932,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
if (!pm)
return -ENOMEM;
pm->parent_ino = parent_ino;
- pm->ino = sctx->cur_ino;
- pm->gen = sctx->cur_inode_gen;
+ pm->ino = ino;
+ pm->gen = ino_gen;
INIT_LIST_HEAD(&pm->list);
INIT_LIST_HEAD(&pm->update_refs);
RB_CLEAR_NODE(&pm->node);
@@ -2878,19 +3003,52 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
{
struct fs_path *from_path = NULL;
struct fs_path *to_path = NULL;
+ struct fs_path *name = NULL;
u64 orig_progress = sctx->send_progress;
struct recorded_ref *cur;
+ u64 parent_ino, parent_gen;
+ struct waiting_dir_move *dm = NULL;
+ u64 rmdir_ino = 0;
int ret;
+ name = fs_path_alloc();
from_path = fs_path_alloc();
- if (!from_path)
- return -ENOMEM;
+ if (!name || !from_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ rmdir_ino = dm->rmdir_ino;
+ free_waiting_dir_move(sctx, dm);
- sctx->send_progress = pm->ino;
- ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
+ ret = get_first_ref(sctx->parent_root, pm->ino,
+ &parent_ino, &parent_gen, name);
if (ret < 0)
goto out;
+ if (parent_ino == sctx->cur_ino) {
+ /* child only renamed, not moved */
+ ASSERT(parent_gen == sctx->cur_inode_gen);
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ from_path);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(from_path, name);
+ if (ret < 0)
+ goto out;
+ } else {
+ /* child moved and maybe renamed too */
+ sctx->send_progress = pm->ino;
+ ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
+ if (ret < 0)
+ goto out;
+ }
+
+ fs_path_free(name);
+ name = NULL;
+
to_path = fs_path_alloc();
if (!to_path) {
ret = -ENOMEM;
@@ -2898,9 +3056,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
}
sctx->send_progress = sctx->cur_ino + 1;
- ret = del_waiting_dir_move(sctx, pm->ino);
- ASSERT(ret == 0);
-
ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
if (ret < 0)
goto out;
@@ -2909,6 +3064,35 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
if (ret < 0)
goto out;
+ if (rmdir_ino) {
+ struct orphan_dir_info *odi;
+
+ odi = get_orphan_dir_info(sctx, rmdir_ino);
+ if (!odi) {
+ /* already deleted */
+ goto finish;
+ }
+ ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+ if (ret < 0)
+ goto out;
+ if (!ret)
+ goto finish;
+
+ name = fs_path_alloc();
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
+ if (ret < 0)
+ goto out;
+ ret = send_rmdir(sctx, name);
+ if (ret < 0)
+ goto out;
+ free_orphan_dir_info(sctx, odi);
+ }
+
+finish:
ret = send_utimes(sctx, pm->ino, pm->gen);
if (ret < 0)
goto out;
@@ -2918,12 +3102,15 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
* and old parent(s).
*/
list_for_each_entry(cur, &pm->update_refs, list) {
+ if (cur->dir == rmdir_ino)
+ continue;
ret = send_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
}
out:
+ fs_path_free(name);
fs_path_free(from_path);
fs_path_free(to_path);
sctx->send_progress = orig_progress;
@@ -2995,17 +3182,19 @@ static int wait_for_parent_move(struct send_ctx *sctx,
int ret;
u64 ino = parent_ref->dir;
u64 parent_ino_before, parent_ino_after;
- u64 new_gen, old_gen;
+ u64 old_gen;
struct fs_path *path_before = NULL;
struct fs_path *path_after = NULL;
int len1, len2;
-
- if (parent_ref->dir <= sctx->cur_ino)
- return 0;
+ int register_upper_dirs;
+ u64 gen;
if (is_waiting_for_move(sctx, ino))
return 1;
+ if (parent_ref->dir <= sctx->cur_ino)
+ return 0;
+
ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
NULL, NULL, NULL, NULL);
if (ret == -ENOENT)
@@ -3013,12 +3202,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
else if (ret < 0)
return ret;
- ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen,
- NULL, NULL, NULL, NULL);
- if (ret < 0)
- return ret;
-
- if (new_gen != old_gen)
+ if (parent_ref->dir_gen != old_gen)
return 0;
path_before = fs_path_alloc();
@@ -3041,7 +3225,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
}
ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
- NULL, path_after);
+ &gen, path_after);
if (ret == -ENOENT) {
ret = 0;
goto out;
@@ -3051,13 +3235,67 @@ static int wait_for_parent_move(struct send_ctx *sctx,
len1 = fs_path_len(path_before);
len2 = fs_path_len(path_after);
- if ((parent_ino_before != parent_ino_after) && (len1 != len2 ||
- memcmp(path_before->start, path_after->start, len1))) {
+ if (parent_ino_before != parent_ino_after || len1 != len2 ||
+ memcmp(path_before->start, path_after->start, len1)) {
ret = 1;
goto out;
}
ret = 0;
+ /*
+ * Ok, our new most direct ancestor has a higher inode number but
+ * wasn't moved/renamed. So maybe some of the new ancestors higher in
+ * the hierarchy have an higher inode number too *and* were renamed
+ * or moved - in this case we need to wait for the ancestor's rename
+ * or move operation before we can do the move/rename for the current
+ * inode.
+ */
+ register_upper_dirs = 0;
+ ino = parent_ino_after;
+again:
+ while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) {
+ u64 parent_gen;
+
+ fs_path_reset(path_before);
+ fs_path_reset(path_after);
+
+ ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
+ &parent_gen, path_after);
+ if (ret < 0)
+ goto out;
+ ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
+ NULL, path_before);
+ if (ret == -ENOENT) {
+ ret = 0;
+ break;
+ } else if (ret < 0) {
+ goto out;
+ }
+
+ len1 = fs_path_len(path_before);
+ len2 = fs_path_len(path_after);
+ if (parent_ino_before != parent_ino_after || len1 != len2 ||
+ memcmp(path_before->start, path_after->start, len1)) {
+ ret = 1;
+ if (register_upper_dirs) {
+ break;
+ } else {
+ register_upper_dirs = 1;
+ ino = parent_ref->dir;
+ gen = parent_ref->dir_gen;
+ goto again;
+ }
+ } else if (register_upper_dirs) {
+ ret = add_pending_dir_move(sctx, ino, gen,
+ parent_ino_after);
+ if (ret < 0 && ret != -EEXIST)
+ goto out;
+ }
+
+ ino = parent_ino_after;
+ gen = parent_gen;
+ }
+
out:
fs_path_free(path_before);
fs_path_free(path_after);
@@ -3079,6 +3317,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
u64 ow_gen;
int did_overwrite = 0;
int is_orphan = 0;
+ u64 last_dir_ino_rm = 0;
verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
@@ -3217,9 +3456,14 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* dirs, we always have one new and one deleted
* ref. The deleted ref is ignored later.
*/
- if (wait_for_parent_move(sctx, cur)) {
+ ret = wait_for_parent_move(sctx, cur);
+ if (ret < 0)
+ goto out;
+ if (ret) {
ret = add_pending_dir_move(sctx,
- cur->dir);
+ sctx->cur_ino,
+ sctx->cur_inode_gen,
+ cur->dir);
*pending_move = 1;
} else {
ret = send_rename(sctx, valid_path,
@@ -3249,7 +3493,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* later, we do this check again and rmdir it then if possible.
* See the use of check_dirs for more details.
*/
- ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino);
+ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
@@ -3340,8 +3585,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
ret = send_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- } else if (ret == inode_state_did_delete) {
- ret = can_rmdir(sctx, cur->dir, sctx->cur_ino);
+ } else if (ret == inode_state_did_delete &&
+ cur->dir != last_dir_ino_rm) {
+ ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
+ sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
@@ -3352,6 +3599,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
ret = send_rmdir(sctx, valid_path);
if (ret < 0)
goto out;
+ last_dir_ino_rm = cur->dir;
}
}
}
@@ -3365,9 +3613,8 @@ out:
return ret;
}
-static int __record_new_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
+static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
+ struct fs_path *name, void *ctx, struct list_head *refs)
{
int ret = 0;
struct send_ctx *sctx = ctx;
@@ -3378,7 +3625,7 @@ static int __record_new_ref(int num, u64 dir, int index,
if (!p)
return -ENOMEM;
- ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
+ ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
NULL, NULL);
if (ret < 0)
goto out;
@@ -3390,7 +3637,7 @@ static int __record_new_ref(int num, u64 dir, int index,
if (ret < 0)
goto out;
- ret = record_ref(&sctx->new_refs, dir, gen, p);
+ ret = __record_ref(refs, dir, gen, p);
out:
if (ret)
@@ -3398,37 +3645,23 @@ out:
return ret;
}
+static int __record_new_ref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx)
+{
+ struct send_ctx *sctx = ctx;
+ return record_ref(sctx->send_root, num, dir, index, name,
+ ctx, &sctx->new_refs);
+}
+
+
static int __record_deleted_ref(int num, u64 dir, int index,
struct fs_path *name,
void *ctx)
{
- int ret = 0;
struct send_ctx *sctx = ctx;
- struct fs_path *p;
- u64 gen;
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
- ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
- NULL, NULL);
- if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, dir, gen, p);
- if (ret < 0)
- goto out;
- ret = fs_path_add_path(p, name);
- if (ret < 0)
- goto out;
-
- ret = record_ref(&sctx->deleted_refs, dir, gen, p);
-
-out:
- if (ret)
- fs_path_free(p);
- return ret;
+ return record_ref(sctx->parent_root, num, dir, index, name,
+ ctx, &sctx->deleted_refs);
}
static int record_new_ref(struct send_ctx *sctx)
@@ -3609,21 +3842,31 @@ static int process_all_refs(struct send_ctx *sctx,
root = sctx->parent_root;
cb = __record_deleted_ref;
} else {
- BUG();
+ btrfs_err(sctx->send_root->fs_info,
+ "Wrong command %d in process_all_refs", cmd);
+ ret = -EINVAL;
+ goto out;
}
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- while (1) {
- ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0)
- goto out;
- if (ret)
- break;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ while (1) {
eb = path->nodes[0];
slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
@@ -3632,11 +3875,10 @@ static int process_all_refs(struct send_ctx *sctx,
break;
ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
- btrfs_release_path(path);
if (ret < 0)
goto out;
- key.offset = found_key.offset + 1;
+ path->slots[0]++;
}
btrfs_release_path(path);
@@ -3917,19 +4159,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
- while (1) {
- ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ while (1) {
eb = path->nodes[0];
slot = path->slots[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
@@ -3941,8 +4189,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
if (ret < 0)
goto out;
- btrfs_release_path(path);
- key.offset = found_key.offset + 1;
+ path->slots[0]++;
}
out:
@@ -3981,6 +4228,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
goto out;
last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+
+ /* initial readahead */
+ memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+ file_ra_state_init(&sctx->ra, inode->i_mapping);
+ btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
+ last_index - index + 1);
+
while (index <= last_index) {
unsigned cur_len = min_t(unsigned, len,
PAGE_CACHE_SIZE - pg_offset);
@@ -4753,18 +5007,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
ret = apply_children_dir_moves(sctx);
if (ret)
goto out;
+ /*
+ * Need to send that every time, no matter if it actually
+ * changed between the two trees as we have done changes to
+ * the inode before. If our inode is a directory and it's
+ * waiting to be moved/renamed, we will send its utimes when
+ * it's moved/renamed, therefore we don't need to do it here.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
+ ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+ if (ret < 0)
+ goto out;
}
- /*
- * Need to send that every time, no matter if it actually
- * changed between the two trees as we have done changes to
- * the inode before.
- */
- sctx->send_progress = sctx->cur_ino + 1;
- ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
- if (ret < 0)
- goto out;
-
out:
return ret;
}
@@ -4830,6 +5085,8 @@ static int changed_inode(struct send_ctx *sctx,
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0], left_ii);
if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
ret = send_create_inode_if_needed(sctx);
} else if (result == BTRFS_COMPARE_TREE_DELETED) {
@@ -4874,6 +5131,8 @@ static int changed_inode(struct send_ctx *sctx,
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0], left_ii);
ret = send_create_inode_if_needed(sctx);
if (ret < 0)
goto out;
@@ -5108,6 +5367,7 @@ out:
static int full_send_tree(struct send_ctx *sctx)
{
int ret;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_key key;
struct btrfs_key found_key;
@@ -5129,6 +5389,19 @@ static int full_send_tree(struct send_ctx *sctx)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
+join_trans:
+ /*
+ * We need to make sure the transaction does not get committed
+ * while we do anything on commit roots. Join a transaction to prevent
+ * this.
+ */
+ trans = btrfs_join_transaction(send_root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+
/*
* Make sure the tree has not changed after re-joining. We detect this
* by comparing start_ctransid and ctransid. They should always match.
@@ -5152,6 +5425,19 @@ static int full_send_tree(struct send_ctx *sctx)
goto out_finish;
while (1) {
+ /*
+ * When someone want to commit while we iterate, end the
+ * joined transaction and rejoin.
+ */
+ if (btrfs_should_end_transaction(trans, send_root)) {
+ ret = btrfs_end_transaction(trans, send_root);
+ trans = NULL;
+ if (ret < 0)
+ goto out;
+ btrfs_release_path(path);
+ goto join_trans;
+ }
+
eb = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(eb, &found_key, slot);
@@ -5179,6 +5465,12 @@ out_finish:
out:
btrfs_free_path(path);
+ if (trans) {
+ if (!ret)
+ ret = btrfs_end_transaction(trans, send_root);
+ else
+ btrfs_end_transaction(trans, send_root);
+ }
return ret;
}
@@ -5330,6 +5622,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
+ sctx->orphan_dirs = RB_ROOT;
sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
(arg->clone_sources_count + 1));
@@ -5467,6 +5760,16 @@ out:
kfree(dm);
}
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
+ struct rb_node *n;
+ struct orphan_dir_info *odi;
+
+ n = rb_first(&sctx->orphan_dirs);
+ odi = rb_entry(n, struct orphan_dir_info, node);
+ free_orphan_dir_info(sctx, odi);
+ }
+
if (sort_clone_roots) {
for (i = 0; i < sctx->clone_roots_cnt; i++)
btrfs_root_dec_send_in_progress(
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 00cd0c57b0b3..9dbf42395153 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -566,7 +566,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
kfree(num);
if (info->max_inline) {
- info->max_inline = max_t(u64,
+ info->max_inline = min_t(u64,
info->max_inline,
root->sectorsize);
}
@@ -855,6 +855,7 @@ static struct dentry *get_default_root(struct super_block *sb,
struct btrfs_path *path;
struct btrfs_key location;
struct inode *inode;
+ struct dentry *dentry;
u64 dir_id;
int new = 0;
@@ -925,7 +926,13 @@ setup_root:
return dget(sb->s_root);
}
- return d_obtain_alias(inode);
+ dentry = d_obtain_alias(inode);
+ if (!IS_ERR(dentry)) {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_DISCONNECTED;
+ spin_unlock(&dentry->d_lock);
+ }
+ return dentry;
}
static int btrfs_fill_super(struct super_block *sb,
@@ -1298,13 +1305,6 @@ error_fs_info:
return ERR_PTR(error);
}
-static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
-{
- spin_lock_irq(&workers->lock);
- workers->max_workers = new_limit;
- spin_unlock_irq(&workers->lock);
-}
-
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
int new_pool_size, int old_pool_size)
{
@@ -1316,21 +1316,20 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_info(fs_info, "resize thread pool %d -> %d",
old_pool_size, new_pool_size);
- btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
- btrfs_set_max_workers(&fs_info->workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
- btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
- new_pool_size);
+ btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
+ new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
+ new_pool_size);
}
static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
@@ -1473,6 +1472,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
sb->s_flags &= ~MS_RDONLY;
}
out:
+ wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
return 0;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 782374d8fd19..c5eb2143dc66 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -24,6 +24,7 @@
#include <linux/kobject.h>
#include <linux/bug.h>
#include <linux/genhd.h>
+#include <linux/debugfs.h>
#include "ctree.h"
#include "disk-io.h"
@@ -578,8 +579,14 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
return -ENOMEM;
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
- struct hd_struct *disk = dev->bdev->bd_part;
- struct kobject *disk_kobj = &part_to_dev(disk)->kobj;
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
+
+ if (!dev->bdev)
+ continue;
+
+ disk = dev->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
error = sysfs_create_link(fs_info->device_dir_kobj,
disk_kobj, disk_kobj->name);
@@ -593,6 +600,12 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
/* /sys/fs/btrfs/ entry */
static struct kset *btrfs_kset;
+/* /sys/kernel/debug/btrfs */
+static struct dentry *btrfs_debugfs_root_dentry;
+
+/* Debugging tunables and exported data */
+u64 btrfs_debugfs_test;
+
int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
{
int error;
@@ -636,27 +649,41 @@ failure:
return error;
}
+static int btrfs_init_debugfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
+ if (!btrfs_debugfs_root_dentry)
+ return -ENOMEM;
+
+ debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
+ &btrfs_debugfs_test);
+#endif
+ return 0;
+}
+
int btrfs_init_sysfs(void)
{
int ret;
+
btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
if (!btrfs_kset)
return -ENOMEM;
- init_feature_attrs();
+ ret = btrfs_init_debugfs();
+ if (ret)
+ return ret;
+ init_feature_attrs();
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
- if (ret) {
- kset_unregister(btrfs_kset);
- return ret;
- }
- return 0;
+ return ret;
}
void btrfs_exit_sysfs(void)
{
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
kset_unregister(btrfs_kset);
+ debugfs_remove_recursive(btrfs_debugfs_root_dentry);
}
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f3cea3710d44..9ab576318a84 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -1,6 +1,11 @@
#ifndef _BTRFS_SYSFS_H_
#define _BTRFS_SYSFS_H_
+/*
+ * Data exported through sysfs
+ */
+extern u64 btrfs_debugfs_test;
+
enum btrfs_feature_set {
FEAT_COMPAT,
FEAT_COMPAT_RO,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 34cd83184c4a..a04707f740d6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -683,7 +683,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
- if (--trans->use_count) {
+ if (trans->use_count > 1) {
+ trans->use_count--;
trans->block_rsv = trans->orig_rsv;
return 0;
}
@@ -731,17 +732,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
}
if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
- if (throttle) {
- /*
- * We may race with somebody else here so end up having
- * to call end_transaction on ourselves again, so inc
- * our use_count.
- */
- trans->use_count++;
+ if (throttle)
return btrfs_commit_transaction(trans, root);
- } else {
+ else
wake_up_process(info->transaction_kthread);
- }
}
if (trans->type & __TRANS_FREEZABLE)
@@ -1578,10 +1572,9 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
trace_btrfs_transaction_commit(root);
- btrfs_scrub_continue(root);
-
if (current->journal_info == trans)
current->journal_info = NULL;
+ btrfs_scrub_cancel(root->fs_info);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
}
@@ -1621,7 +1614,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
- return btrfs_start_delalloc_roots(fs_info, 1);
+ return btrfs_start_delalloc_roots(fs_info, 1, -1);
return 0;
}
@@ -1754,7 +1747,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
/* ->aborted might be set after the previous check, so check it */
if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
* the reloc mutex makes sure that we stop
@@ -1771,7 +1764,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = create_pending_snapshots(trans, root->fs_info);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1787,13 +1780,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_run_delayed_items(trans, root);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1823,7 +1816,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1844,7 +1837,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1855,7 +1848,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = cur_trans->aborted;
mutex_unlock(&root->fs_info->tree_log_mutex);
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
btrfs_prepare_extent_commit(trans, root);
@@ -1891,13 +1884,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_error(root->fs_info, ret,
"Error while writing out transaction");
mutex_unlock(&root->fs_info->tree_log_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
ret = write_ctree_super(trans, root, 0);
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1940,6 +1933,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
+scrub_continue:
+ btrfs_scrub_continue(root);
cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39d83da03e03..e2f45fc02610 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -136,13 +136,20 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
* syncing the tree wait for us to finish
*/
static int start_log_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
{
+ int index;
int ret;
- int err = 0;
mutex_lock(&root->log_mutex);
if (root->log_root) {
+ if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
+ trans->transid) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
if (!root->log_start_pid) {
root->log_start_pid = current->pid;
root->log_multiple_pids = false;
@@ -152,27 +159,40 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
mutex_unlock(&root->log_mutex);
return 0;
}
- root->log_multiple_pids = false;
- root->log_start_pid = current->pid;
+
+ ret = 0;
mutex_lock(&root->fs_info->tree_log_mutex);
- if (!root->fs_info->log_root_tree) {
+ if (!root->fs_info->log_root_tree)
ret = btrfs_init_log_root_tree(trans, root->fs_info);
- if (ret)
- err = ret;
- }
- if (err == 0 && !root->log_root) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ if (ret)
+ goto out;
+
+ if (!root->log_root) {
ret = btrfs_add_log_tree(trans, root);
if (ret)
- err = ret;
+ goto out;
}
- mutex_unlock(&root->fs_info->tree_log_mutex);
+ root->log_multiple_pids = false;
+ root->log_start_pid = current->pid;
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
+out:
mutex_unlock(&root->log_mutex);
- return err;
+ return ret;
}
/*
@@ -2359,8 +2379,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
return ret;
}
-static int wait_log_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, unsigned long transid)
+static void wait_log_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -2375,36 +2395,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && root->log_transid < transid + 2 &&
+ if (root->log_transid_committed < transid &&
atomic_read(&root->log_commit[index]))
schedule();
finish_wait(&root->log_commit_wait[index], &wait);
mutex_lock(&root->log_mutex);
- } while (root->fs_info->last_trans_log_full_commit !=
- trans->transid && root->log_transid < transid + 2 &&
+ } while (root->log_transid_committed < transid &&
atomic_read(&root->log_commit[index]));
- return 0;
}
static void wait_for_writer(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
DEFINE_WAIT(wait);
- while (root->fs_info->last_trans_log_full_commit !=
- trans->transid && atomic_read(&root->log_writers)) {
+
+ while (atomic_read(&root->log_writers)) {
prepare_to_wait(&root->log_writer_wait,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && atomic_read(&root->log_writers))
+ if (atomic_read(&root->log_writers))
schedule();
mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
}
}
+static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ if (!ctx)
+ return;
+
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx->list);
+ mutex_unlock(&root->log_mutex);
+}
+
+/*
+ * Invoked in log mutex context, or be sure there is no other task which
+ * can access the list.
+ */
+static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
+ int index, int error)
+{
+ struct btrfs_log_ctx *ctx;
+
+ if (!error) {
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
+ return;
+ }
+
+ list_for_each_entry(ctx, &root->log_ctxs[index], list)
+ ctx->log_ret = error;
+
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
+}
+
/*
* btrfs_sync_log does sends a given tree log down to the disk and
* updates the super blocks to record it. When this call is done,
@@ -2418,7 +2465,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
* that has happened.
*/
int btrfs_sync_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
int index1;
int index2;
@@ -2426,22 +2473,30 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
- unsigned long log_transid = 0;
+ int log_transid = 0;
+ struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
mutex_lock(&root->log_mutex);
- log_transid = root->log_transid;
- index1 = root->log_transid % 2;
+ log_transid = ctx->log_transid;
+ if (root->log_transid_committed >= log_transid) {
+ mutex_unlock(&root->log_mutex);
+ return ctx->log_ret;
+ }
+
+ index1 = log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
- wait_log_commit(trans, root, root->log_transid);
+ wait_log_commit(trans, root, log_transid);
mutex_unlock(&root->log_mutex);
- return 0;
+ return ctx->log_ret;
}
+ ASSERT(log_transid == root->log_transid);
atomic_set(&root->log_commit[index1], 1);
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
- wait_log_commit(trans, root, root->log_transid - 1);
+ wait_log_commit(trans, root, log_transid - 1);
+
while (1) {
int batch = atomic_read(&root->log_batch);
/* when we're on an ssd, just kick the log commit out */
@@ -2456,7 +2511,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
/* bail out if we need to do a full commit */
- if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
+ trans->transid) {
ret = -EAGAIN;
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
@@ -2477,6 +2533,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
+ ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+ trans->transid;
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -2486,7 +2544,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
- smp_mb();
/*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
@@ -2494,9 +2551,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->log_mutex);
+ btrfs_init_log_ctx(&root_log_ctx);
+
mutex_lock(&log_root_tree->log_mutex);
atomic_inc(&log_root_tree->log_batch);
atomic_inc(&log_root_tree->log_writers);
+
+ index2 = log_root_tree->log_transid % 2;
+ list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+ root_log_ctx.log_transid = log_root_tree->log_transid;
+
mutex_unlock(&log_root_tree->log_mutex);
ret = update_log_root(trans, log);
@@ -2509,13 +2573,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (ret) {
+ if (!list_empty(&root_log_ctx.list))
+ list_del_init(&root_log_ctx.list);
+
blk_finish_plug(&plug);
+ ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+ trans->transid;
if (ret != -ENOSPC) {
btrfs_abort_transaction(trans, root, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
}
- root->fs_info->last_trans_log_full_commit = trans->transid;
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2523,22 +2591,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
- index2 = log_root_tree->log_transid % 2;
+ if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = root_log_ctx.log_ret;
+ goto out;
+ }
+
+ index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid);
+ root_log_ctx.log_transid);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
- ret = 0;
+ ret = root_log_ctx.log_ret;
goto out;
}
+ ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid - 1);
+ root_log_ctx.log_transid - 1);
}
wait_for_writer(trans, log_root_tree);
@@ -2547,7 +2622,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
*/
- if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
+ trans->transid) {
blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
@@ -2561,6 +2637,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW);
blk_finish_plug(&plug);
if (ret) {
+ ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+ trans->transid;
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2578,8 +2656,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_header_level(log_root_tree->node));
log_root_tree->log_transid++;
- smp_mb();
-
mutex_unlock(&log_root_tree->log_mutex);
/*
@@ -2591,6 +2667,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
if (ret) {
+ ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+ trans->transid;
btrfs_abort_transaction(trans, root, ret);
goto out_wake_log_root;
}
@@ -2601,13 +2679,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
out_wake_log_root:
+ /*
+ * We needn't get log_mutex here because we are sure all
+ * the other tasks are blocked.
+ */
+ btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
+
+ mutex_lock(&log_root_tree->log_mutex);
+ log_root_tree->log_transid_committed++;
atomic_set(&log_root_tree->log_commit[index2], 0);
- smp_mb();
+ mutex_unlock(&log_root_tree->log_mutex);
+
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
wake_up(&log_root_tree->log_commit_wait[index2]);
out:
+ /* See above. */
+ btrfs_remove_all_log_ctxs(root, index1, ret);
+
+ mutex_lock(&root->log_mutex);
+ root->log_transid_committed++;
atomic_set(&root->log_commit[index1], 0);
- smp_mb();
+ mutex_unlock(&root->log_mutex);
+
if (waitqueue_active(&root->log_commit_wait[index1]))
wake_up(&root->log_commit_wait[index1]);
return ret;
@@ -3479,7 +3572,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
static int log_one_extent(struct btrfs_trans_handle *trans,
struct inode *inode, struct btrfs_root *root,
- struct extent_map *em, struct btrfs_path *path)
+ struct extent_map *em, struct btrfs_path *path,
+ struct list_head *logged_list)
{
struct btrfs_root *log = root->log_root;
struct btrfs_file_extent_item *fi;
@@ -3495,7 +3589,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
- int index = log->log_transid % 2;
bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
int extent_inserted = 0;
@@ -3579,17 +3672,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
* First check and see if our csums are on our outstanding ordered
* extents.
*/
-again:
- spin_lock_irq(&log->log_extents_lock[index]);
- list_for_each_entry(ordered, &log->logged_list[index], log_list) {
+ list_for_each_entry(ordered, logged_list, log_list) {
struct btrfs_ordered_sum *sum;
if (!mod_len)
break;
- if (ordered->inode != inode)
- continue;
-
if (ordered->file_offset + ordered->len <= mod_start ||
mod_start + mod_len <= ordered->file_offset)
continue;
@@ -3632,12 +3720,6 @@ again:
if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
&ordered->flags))
continue;
- atomic_inc(&ordered->refs);
- spin_unlock_irq(&log->log_extents_lock[index]);
- /*
- * we've dropped the lock, we must either break or
- * start over after this.
- */
if (ordered->csum_bytes_left) {
btrfs_start_ordered_extent(inode, ordered, 0);
@@ -3647,16 +3729,11 @@ again:
list_for_each_entry(sum, &ordered->list, list) {
ret = btrfs_csum_file_blocks(trans, log, sum);
- if (ret) {
- btrfs_put_ordered_extent(ordered);
+ if (ret)
goto unlocked;
- }
}
- btrfs_put_ordered_extent(ordered);
- goto again;
}
- spin_unlock_irq(&log->log_extents_lock[index]);
unlocked:
if (!mod_len || ret)
@@ -3694,7 +3771,8 @@ unlocked:
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ struct list_head *logged_list)
{
struct extent_map *em, *n;
struct list_head extents;
@@ -3752,7 +3830,7 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path);
+ ret = log_one_extent(trans, inode, root, em, path, logged_list);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -3788,6 +3866,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src = NULL;
+ LIST_HEAD(logged_list);
u64 last_extent = 0;
int err = 0;
int ret;
@@ -3836,7 +3915,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
mutex_lock(&BTRFS_I(inode)->log_mutex);
- btrfs_get_logged_extents(log, inode);
+ btrfs_get_logged_extents(inode, &logged_list);
/*
* a brute force approach to making sure we get the most uptodate
@@ -3962,7 +4041,8 @@ log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (fast_search) {
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
+ ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
+ &logged_list);
if (ret) {
err = ret;
goto out_unlock;
@@ -3987,8 +4067,10 @@ log_extents:
BTRFS_I(inode)->logged_trans = trans->transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
- if (err)
- btrfs_free_logged_extents(log, log->log_transid);
+ if (unlikely(err))
+ btrfs_put_logged_extents(&logged_list);
+ else
+ btrfs_submit_logged_extents(&logged_list, log);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
btrfs_free_path(path);
@@ -4079,7 +4161,8 @@ out:
*/
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
- struct dentry *parent, int exists_only)
+ struct dentry *parent, int exists_only,
+ struct btrfs_log_ctx *ctx)
{
int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
@@ -4116,9 +4199,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- ret = start_log_trans(trans, root);
+ ret = start_log_trans(trans, root, ctx);
if (ret)
- goto end_trans;
+ goto end_no_trans;
ret = btrfs_log_inode(trans, root, inode, inode_only);
if (ret)
@@ -4166,6 +4249,9 @@ end_trans:
root->fs_info->last_trans_log_full_commit = trans->transid;
ret = 1;
}
+
+ if (ret)
+ btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
end_no_trans:
return ret;
@@ -4178,12 +4264,14 @@ end_no_trans:
* data on disk.
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry)
+ struct btrfs_root *root, struct dentry *dentry,
+ struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
- ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
+ ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
+ 0, ctx);
dput(parent);
return ret;
@@ -4420,6 +4508,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
root->fs_info->last_trans_committed))
return 0;
- return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+ return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1d4ae0d15a70..91b145fce333 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,28 @@
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
+struct btrfs_log_ctx {
+ int log_ret;
+ int log_transid;
+ struct list_head list;
+};
+
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+{
+ ctx->log_ret = 0;
+ ctx->log_transid = 0;
+ INIT_LIST_HEAD(&ctx->list);
+}
+
int btrfs_sync_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry);
+ struct btrfs_root *root, struct dentry *dentry,
+ struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bab0b84d8f80..d241130a32fd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -415,7 +415,8 @@ loop_lock:
device->running_pending = 1;
spin_unlock(&device->io_lock);
- btrfs_requeue_work(&device->work);
+ btrfs_queue_work(fs_info->submit_workers,
+ &device->work);
goto done;
}
/* unplug every 64 requests just for good measure */
@@ -5263,6 +5264,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
static void btrfs_end_bio(struct bio *bio, int err)
{
struct btrfs_bio *bbio = bio->bi_private;
+ struct btrfs_device *dev = bbio->stripes[0].dev;
int is_orig_bio = 0;
if (err) {
@@ -5270,7 +5272,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
if (err == -EIO || err == -EREMOTEIO) {
unsigned int stripe_index =
btrfs_io_bio(bio)->stripe_index;
- struct btrfs_device *dev;
BUG_ON(stripe_index >= bbio->num_stripes);
dev = bbio->stripes[stripe_index].dev;
@@ -5292,6 +5293,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
if (bio == bbio->orig_bio)
is_orig_bio = 1;
+ btrfs_bio_counter_dec(bbio->fs_info);
+
if (atomic_dec_and_test(&bbio->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
@@ -5328,13 +5331,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
}
}
-struct async_sched {
- struct bio *bio;
- int rw;
- struct btrfs_fs_info *info;
- struct btrfs_work work;
-};
-
/*
* see run_scheduled_bios for a description of why bios are collected for
* async submit.
@@ -5391,8 +5387,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
spin_unlock(&device->io_lock);
if (should_queue)
- btrfs_queue_worker(&root->fs_info->submit_workers,
- &device->work);
+ btrfs_queue_work(root->fs_info->submit_workers,
+ &device->work);
}
static int bio_size_ok(struct block_device *bdev, struct bio *bio,
@@ -5447,6 +5443,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
}
#endif
bio->bi_bdev = dev->bdev;
+
+ btrfs_bio_counter_inc_noblocked(root->fs_info);
+
if (async)
btrfs_schedule_bio(root, dev, rw, bio);
else
@@ -5515,28 +5514,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
length = bio->bi_iter.bi_size;
map_length = length;
+ btrfs_bio_counter_inc_blocked(root->fs_info);
ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
mirror_num, &raid_map);
- if (ret) /* -ENOMEM */
+ if (ret) {
+ btrfs_bio_counter_dec(root->fs_info);
return ret;
+ }
total_devs = bbio->num_stripes;
bbio->orig_bio = first_bio;
bbio->private = first_bio->bi_private;
bbio->end_io = first_bio->bi_end_io;
+ bbio->fs_info = root->fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
if (raid_map) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (rw & WRITE) {
- return raid56_parity_write(root, bio, bbio,
- raid_map, map_length);
+ ret = raid56_parity_write(root, bio, bbio,
+ raid_map, map_length);
} else {
- return raid56_parity_recover(root, bio, bbio,
- raid_map, map_length,
- mirror_num);
+ ret = raid56_parity_recover(root, bio, bbio,
+ raid_map, map_length,
+ mirror_num);
}
+ /*
+ * FIXME, replace dosen't support raid56 yet, please fix
+ * it in the future.
+ */
+ btrfs_bio_counter_dec(root->fs_info);
+ return ret;
}
if (map_length < length) {
@@ -5578,6 +5587,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
async_submit);
dev_nr++;
}
+ btrfs_bio_counter_dec(root->fs_info);
return 0;
}
@@ -5666,7 +5676,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
else
generate_random_uuid(dev->uuid);
- dev->work.func = pending_bios_fn;
+ btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
return dev;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 8b3cd142b373..80754f9dd3df 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
struct btrfs_bio {
atomic_t stripes_pending;
+ struct btrfs_fs_info *fs_info;
bio_end_io_t *end_io;
struct bio *orig_bio;
void *private;