Btrfs: use seqlock to protect fs_info->avail_{data, metadata, system}_alloc_bits

[cascardo/linux.git] / fs / btrfs / extent-tree.c
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c

index 3d3e2c1..faff98f 100644 (file)
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
  #include "volumes.h"
  #include "locking.h"
  #include "free-space-cache.h"
+#include "math.h"
  
  #undef SCRAMBLE_DELAYED_REFS
  
@@ -71,8 +72,7 @@ enum {
         RESERVE_ALLOC_NO_ACCOUNT = 2,
  };
  
-static int update_block_group(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
                               u64 bytenr, u64 num_bytes, int alloc);
  static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
@@ -161,6 +161,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
         rb_link_node(&block_group->cache_node, parent, p);
         rb_insert_color(&block_group->cache_node,
                         &info->block_group_cache_tree);
+
+       if (info->first_logical_byte > block_group->key.objectid)
+               info->first_logical_byte = block_group->key.objectid;
+
         spin_unlock(&info->block_group_cache_lock);
  
         return 0;
@@ -202,8 +206,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
                         break;
                 }
         }
-       if (ret)
+       if (ret) {
                 btrfs_get_block_group(ret);
+               if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
+                       info->first_logical_byte = ret->key.objectid;
+       }
         spin_unlock(&info->block_group_cache_lock);
  
         return ret;
@@ -467,8 +474,6 @@ out:
  }
  
  static int cache_block_group(struct btrfs_block_group_cache *cache,
-                            struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root,
                              int load_cache_only)
  {
         DEFINE_WAIT(wait);
@@ -526,12 +531,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
         cache->cached = BTRFS_CACHE_FAST;
         spin_unlock(&cache->lock);
  
-       /*
-        * We can't do the read from on-disk cache during a commit since we need
-        * to have the normal tree locking.  Also if we are currently trying to
-        * allocate blocks for the tree root we can't do the fast caching since
-        * we likely hold important locks.
-        */
         if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
                 ret = load_free_space_cache(fs_info, cache);
  
@@ -649,24 +648,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
         rcu_read_unlock();
  }
  
-static u64 div_factor(u64 num, int factor)
-{
-       if (factor == 10)
-               return num;
-       num *= factor;
-       do_div(num, 10);
-       return num;
-}
-
-static u64 div_factor_fine(u64 num, int factor)
-{
-       if (factor == 100)
-               return num;
-       num *= factor;
-       do_div(num, 100);
-       return num;
-}
-
  u64 btrfs_find_block_group(struct btrfs_root *root,
                            u64 search_start, u64 search_hint, int owner)
  {
@@ -1835,7 +1816,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
  
  
         /* Tell the block device(s) that the sectors can be discarded */
-       ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
+       ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
                               bytenr, &num_bytes, &bbio, 0);
         /* Error condition is -ENOMEM */
         if (!ret) {
@@ -2160,7 +2141,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                                                       node->num_bytes);
                         }
                 }
-               mutex_unlock(&head->mutex);
                 return ret;
         }
  
@@ -2275,7 +2255,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                          * process of being added. Don't run this ref yet.
                          */
                         list_del_init(&locked_ref->cluster);
-                       mutex_unlock(&locked_ref->mutex);
+                       btrfs_delayed_ref_unlock(locked_ref);
                         locked_ref = NULL;
                         delayed_refs->num_heads_ready++;
                         spin_unlock(&delayed_refs->lock);
@@ -2302,7 +2282,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                         ref = &locked_ref->node;
  
                         if (extent_op && must_insert_reserved) {
-                               kfree(extent_op);
+                               btrfs_free_delayed_extent_op(extent_op);
                                 extent_op = NULL;
                         }
  
@@ -2311,25 +2291,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
  
                                 ret = run_delayed_extent_op(trans, root,
                                                             ref, extent_op);
-                               kfree(extent_op);
+                               btrfs_free_delayed_extent_op(extent_op);
  
                                 if (ret) {
-                                       printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
+                                       printk(KERN_DEBUG
+                                              "btrfs: run_delayed_extent_op "
+                                              "returned %d\n", ret);
                                         spin_lock(&delayed_refs->lock);
+                                       btrfs_delayed_ref_unlock(locked_ref);
                                         return ret;
                                 }
  
                                 goto next;
                         }
-
-                       list_del_init(&locked_ref->cluster);
-                       locked_ref = NULL;
                 }
  
                 ref->in_tree = 0;
                 rb_erase(&ref->rb_node, &delayed_refs->root);
                 delayed_refs->num_entries--;
-               if (locked_ref) {
+               if (!btrfs_delayed_ref_is_head(ref)) {
                         /*
                          * when we play the delayed ref, also correct the
                          * ref_mod on head
@@ -2351,16 +2331,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
                                           must_insert_reserved);
  
-               btrfs_put_delayed_ref(ref);
-               kfree(extent_op);
-               count++;
-
+               btrfs_free_delayed_extent_op(extent_op);
                 if (ret) {
-                       printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
+                       btrfs_delayed_ref_unlock(locked_ref);
+                       btrfs_put_delayed_ref(ref);
+                       printk(KERN_DEBUG
+                              "btrfs: run_one_delayed_ref returned %d\n", ret);
                         spin_lock(&delayed_refs->lock);
                         return ret;
                 }
  
+               /*
+                * If this node is a head, that means all the refs in this head
+                * have been dealt with, and we will pick the next head to deal
+                * with, so we must unlock the head and drop it from the cluster
+                * list before we release it.
+                */
+               if (btrfs_delayed_ref_is_head(ref)) {
+                       list_del_init(&locked_ref->cluster);
+                       btrfs_delayed_ref_unlock(locked_ref);
+                       locked_ref = NULL;
+               }
+               btrfs_put_delayed_ref(ref);
+               count++;
  next:
                 cond_resched();
                 spin_lock(&delayed_refs->lock);
@@ -2510,6 +2503,7 @@ again:
  
                 ret = run_clustered_refs(trans, root, &cluster);
                 if (ret < 0) {
+                       btrfs_release_ref_cluster(&cluster);
                         spin_unlock(&delayed_refs->lock);
                         btrfs_abort_transaction(trans, root, ret);
                         return ret;
@@ -2596,7 +2590,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
         struct btrfs_delayed_extent_op *extent_op;
         int ret;
  
-       extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+       extent_op = btrfs_alloc_delayed_extent_op();
         if (!extent_op)
                 return -ENOMEM;
  
@@ -2608,7 +2602,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
                                           num_bytes, extent_op);
         if (ret)
-               kfree(extent_op);
+               btrfs_free_delayed_extent_op(extent_op);
         return ret;
  }
  
@@ -3233,12 +3227,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
         u64 extra_flags = chunk_to_extended(flags) &
                                 BTRFS_EXTENDED_PROFILE_MASK;
  
+       write_seqlock(&fs_info->profiles_lock);
         if (flags & BTRFS_BLOCK_GROUP_DATA)
                 fs_info->avail_data_alloc_bits |= extra_flags;
         if (flags & BTRFS_BLOCK_GROUP_METADATA)
                 fs_info->avail_metadata_alloc_bits |= extra_flags;
         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
                 fs_info->avail_system_alloc_bits |= extra_flags;
+       write_sequnlock(&fs_info->profiles_lock);
  }
  
  /*
@@ -3330,12 +3326,18 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
  
  static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
  {
-       if (flags & BTRFS_BLOCK_GROUP_DATA)
-               flags |= root->fs_info->avail_data_alloc_bits;
-       else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-               flags |= root->fs_info->avail_system_alloc_bits;
-       else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-               flags |= root->fs_info->avail_metadata_alloc_bits;
+       unsigned seq;
+
+       do {
+               seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+               if (flags & BTRFS_BLOCK_GROUP_DATA)
+                       flags |= root->fs_info->avail_data_alloc_bits;
+               else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+                       flags |= root->fs_info->avail_system_alloc_bits;
+               else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+                       flags |= root->fs_info->avail_metadata_alloc_bits;
+       } while (read_seqretry(&root->fs_info->profiles_lock, seq));
  
         return btrfs_reduce_alloc_profile(root, flags);
  }
@@ -3574,6 +3576,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
         int wait_for_alloc = 0;
         int ret = 0;
  
+       /* Don't re-enter if we're already allocating a chunk */
+       if (trans->allocating_chunk)
+               return -ENOSPC;
+
         space_info = __find_space_info(extent_root->fs_info, flags);
         if (!space_info) {
                 ret = update_space_info(extent_root->fs_info, flags,
@@ -3616,6 +3622,8 @@ again:
                 goto again;
         }
  
+       trans->allocating_chunk = true;
+
         /*
          * If we have mixed data/metadata chunks we want to make sure we keep
          * allocating mixed chunks instead of individual chunks.
@@ -3642,6 +3650,7 @@ again:
         check_system_chunk(trans, extent_root, flags);
  
         ret = btrfs_alloc_chunk(trans, extent_root, flags);
+       trans->allocating_chunk = false;
         if (ret < 0 && ret != -ENOSPC)
                 goto out;
  
@@ -3661,7 +3670,7 @@ out:
  
  static int can_overcommit(struct btrfs_root *root,
                           struct btrfs_space_info *space_info, u64 bytes,
-                         int flush)
+                         enum btrfs_reserve_flush_enum flush)
  {
         u64 profile = btrfs_get_alloc_profile(root, 0);
         u64 avail;
@@ -3685,11 +3694,11 @@ static int can_overcommit(struct btrfs_root *root,
                 avail >>= 1;
  
         /*
-        * If we aren't flushing don't let us overcommit too much, say
-        * 1/8th of the space.  If we can flush, let it overcommit up to
-        * 1/2 of the space.
+        * If we aren't flushing all things, let us overcommit up to
+        * 1/2 of the space. If we can flush all things, don't let us
+        * overcommit too much: let it overcommit up to 1/8 of the space.
          */
-       if (flush)
+       if (flush == BTRFS_RESERVE_FLUSH_ALL)
                 avail >>= 3;
         else
                 avail >>= 1;
@@ -3699,6 +3708,45 @@ static int can_overcommit(struct btrfs_root *root,
         return 0;
  }
  
+static inline int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
+                                                     unsigned long nr_pages,
+                                                     enum wb_reason reason)
+{
+       /* Flusher is already working on the dirty inodes; treat as started. */
+       if (writeback_in_progress(sb->s_bdi))
+               return 1;
+
+       if (down_read_trylock(&sb->s_umount)) {
+               writeback_inodes_sb_nr(sb, nr_pages, reason);
+               up_read(&sb->s_umount);
+               return 1;
+       }
+
+       return 0;
+}
+
+void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
+                                 unsigned long nr_pages)
+{
+       struct super_block *sb = root->fs_info->sb;
+       int started;
+
+       /* If we cannot start writeback, just sync all the delalloc files. */
+       started = writeback_inodes_sb_nr_if_idle_safe(sb, nr_pages,
+                                                     WB_REASON_FS_FREE_SPACE);
+       if (!started) {
+               /*
+                * We needn't worry about the filesystem going from r/w to r/o
+                * even though we don't take ->s_umount here, because the
+                * filesystem should guarantee that the delalloc inode list is
+                * empty once it is read-only (all dirty pages have been
+                * written to disk).
+                */
+               btrfs_start_delalloc_inodes(root, 0);
+               btrfs_wait_ordered_extents(root, 0);
+       }
+}
+
  /*
   * shrink metadata reservation for delalloc
   */
@@ -3713,13 +3761,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
         long time_left;
         unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
         int loops = 0;
+       enum btrfs_reserve_flush_enum flush;
  
         trans = (struct btrfs_trans_handle *)current->journal_info;
         block_rsv = &root->fs_info->delalloc_block_rsv;
         space_info = block_rsv->space_info;
  
         smp_mb();
-       delalloc_bytes = root->fs_info->delalloc_bytes;
+       delalloc_bytes = percpu_counter_sum_positive(
+                                               &root->fs_info->delalloc_bytes);
         if (delalloc_bytes == 0) {
                 if (trans)
                         return;
@@ -3730,9 +3780,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
         while (delalloc_bytes && loops < 3) {
                 max_reclaim = min(delalloc_bytes, to_reclaim);
                 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
-               writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
-                                              WB_REASON_FS_FREE_SPACE);
-
+               btrfs_writeback_inodes_sb_nr(root, nr_pages);
                 /*
                  * We need to wait for the async pages to actually start before
                  * we do anything.
@@ -3740,8 +3788,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
                 wait_event(root->fs_info->async_submit_wait,
                            !atomic_read(&root->fs_info->async_delalloc_pages));
  
+               if (!trans)
+                       flush = BTRFS_RESERVE_FLUSH_ALL;
+               else
+                       flush = BTRFS_RESERVE_NO_FLUSH;
                 spin_lock(&space_info->lock);
-               if (can_overcommit(root, space_info, orig, !trans)) {
+               if (can_overcommit(root, space_info, orig, flush)) {
                         spin_unlock(&space_info->lock);
                         break;
                 }
@@ -3756,7 +3808,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
                                 break;
                 }
                 smp_mb();
-               delalloc_bytes = root->fs_info->delalloc_bytes;
+               delalloc_bytes = percpu_counter_sum_positive(
+                                               &root->fs_info->delalloc_bytes);
         }
  }
  
@@ -3899,7 +3952,8 @@ static int flush_space(struct btrfs_root *root,
   */
  static int reserve_metadata_bytes(struct btrfs_root *root,
                                   struct btrfs_block_rsv *block_rsv,
-                                 u64 orig_bytes, int flush)
+                                 u64 orig_bytes,
+                                 enum btrfs_reserve_flush_enum flush)
  {
         struct btrfs_space_info *space_info = block_rsv->space_info;
         u64 used;
@@ -3912,10 +3966,11 @@ again:
         ret = 0;
         spin_lock(&space_info->lock);
         /*
-        * We only want to wait if somebody other than us is flushing and we are
-        * actually alloed to flush.
+        * We only want to wait if somebody other than us is flushing and we
+        * are actually allowed to flush all things.
          */
-       while (flush && !flushing && space_info->flush) {
+       while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+              space_info->flush) {
                 spin_unlock(&space_info->lock);
                 /*
                  * If we have a trans handle we can't wait because the flusher
@@ -3981,23 +4036,40 @@ again:
          * Couldn't make our reservation, save our place so while we're trying
          * to reclaim space we can actually use it instead of somebody else
          * stealing it from us.
+        *
+        * We make the other tasks wait for the flush only when we can flush
+        * all things.
          */
-       if (ret && flush) {
+       if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
                 flushing = true;
                 space_info->flush = 1;
         }
  
         spin_unlock(&space_info->lock);
  
-       if (!ret || !flush)
+       if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
                 goto out;
  
         ret = flush_space(root, space_info, num_bytes, orig_bytes,
                           flush_state);
         flush_state++;
+
+       /*
+        * If we are FLUSH_LIMIT, we cannot flush delalloc, or a deadlock
+        * would happen. So skip the delalloc flush.
+        */
+       if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+           (flush_state == FLUSH_DELALLOC ||
+            flush_state == FLUSH_DELALLOC_WAIT))
+               flush_state = ALLOC_CHUNK;
+
         if (!ret)
                 goto again;
-       else if (flush_state <= COMMIT_TRANS)
+       else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+                flush_state < COMMIT_TRANS)
+               goto again;
+       else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+                flush_state <= COMMIT_TRANS)
                 goto again;
  
  out:
@@ -4148,9 +4220,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
         kfree(rsv);
  }
  
-static inline int __block_rsv_add(struct btrfs_root *root,
-                                 struct btrfs_block_rsv *block_rsv,
-                                 u64 num_bytes, int flush)
+int btrfs_block_rsv_add(struct btrfs_root *root,
+                       struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+                       enum btrfs_reserve_flush_enum flush)
  {
         int ret;
  
@@ -4166,20 +4238,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
         return ret;
  }
  
-int btrfs_block_rsv_add(struct btrfs_root *root,
-                       struct btrfs_block_rsv *block_rsv,
-                       u64 num_bytes)
-{
-       return __block_rsv_add(root, block_rsv, num_bytes, 1);
-}
-
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
-                               struct btrfs_block_rsv *block_rsv,
-                               u64 num_bytes)
-{
-       return __block_rsv_add(root, block_rsv, num_bytes, 0);
-}
-
  int btrfs_block_rsv_check(struct btrfs_root *root,
                           struct btrfs_block_rsv *block_rsv, int min_factor)
  {
@@ -4198,9 +4256,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
         return ret;
  }
  
-static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
-                                          struct btrfs_block_rsv *block_rsv,
-                                          u64 min_reserved, int flush)
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+                          struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+                          enum btrfs_reserve_flush_enum flush)
  {
         u64 num_bytes = 0;
         int ret = -ENOSPC;
@@ -4228,20 +4286,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
         return ret;
  }
  
-int btrfs_block_rsv_refill(struct btrfs_root *root,
-                          struct btrfs_block_rsv *block_rsv,
-                          u64 min_reserved)
-{
-       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
-}
-
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
-                                  struct btrfs_block_rsv *block_rsv,
-                                  u64 min_reserved)
-{
-       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
-}
-
  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                             struct btrfs_block_rsv *dst_rsv,
                             u64 num_bytes)
@@ -4532,17 +4576,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
         u64 csum_bytes;
         unsigned nr_extents = 0;
         int extra_reserve = 0;
-       int flush = 1;
-       int ret;
+       enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
+       int ret = 0;
+       bool delalloc_lock = true;
  
-       /* Need to be holding the i_mutex here if we aren't free space cache */
-       if (btrfs_is_free_space_inode(inode))
-               flush = 0;
+       /* If we are a free space inode we need to not flush since we will be in
+        * the middle of a transaction commit.  We also don't need the delalloc
+        * mutex since we won't race with anybody.  We need this mostly to make
+        * lockdep shut its filthy mouth.
+        */
+       if (btrfs_is_free_space_inode(inode)) {
+               flush = BTRFS_RESERVE_NO_FLUSH;
+               delalloc_lock = false;
+       }
  
-       if (flush && btrfs_transaction_in_commit(root->fs_info))
+       if (flush != BTRFS_RESERVE_NO_FLUSH &&
+           btrfs_transaction_in_commit(root->fs_info))
                 schedule_timeout(1);
  
-       mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+       if (delalloc_lock)
+               mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+
         num_bytes = ALIGN(num_bytes, root->sectorsize);
  
         spin_lock(&BTRFS_I(inode)->lock);
@@ -4568,16 +4622,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
         csum_bytes = BTRFS_I(inode)->csum_bytes;
         spin_unlock(&BTRFS_I(inode)->lock);
  
-       if (root->fs_info->quota_enabled) {
+       if (root->fs_info->quota_enabled)
                 ret = btrfs_qgroup_reserve(root, num_bytes +
                                            nr_extents * root->leafsize);
-               if (ret) {
-                       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
-                       return ret;
-               }
-       }
  
-       ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+       /*
+        * ret != 0 here means the qgroup reservation failed, so we go
+        * straight to the shared error handling.
+        */
+       if (ret == 0)
+               ret = reserve_metadata_bytes(root, block_rsv,
+                                            to_reserve, flush);
+
         if (ret) {
                 u64 to_free = 0;
                 unsigned dropped;
@@ -4607,7 +4663,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
                                                       btrfs_ino(inode),
                                                       to_free, 0);
                 }
-               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+               if (root->fs_info->quota_enabled) {
+                       btrfs_qgroup_free(root, num_bytes +
+                                               nr_extents * root->leafsize);
+               }
+               if (delalloc_lock)
+                       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
                 return ret;
         }
  
@@ -4619,7 +4680,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
         }
         BTRFS_I(inode)->reserved_extents += nr_extents;
         spin_unlock(&BTRFS_I(inode)->lock);
-       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+
+       if (delalloc_lock)
+               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
  
         if (to_reserve)
                 trace_btrfs_space_reservation(root->fs_info,"delalloc",
@@ -4715,8 +4778,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
         btrfs_free_reserved_data_space(inode, num_bytes);
  }
  
-static int update_block_group(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
                               u64 bytenr, u64 num_bytes, int alloc)
  {
         struct btrfs_block_group_cache *cache = NULL;
@@ -4753,7 +4815,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                  * space back to the block group, otherwise we will leak space.
                  */
                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
-                       cache_block_group(cache, trans, NULL, 1);
+                       cache_block_group(cache, 1);
  
                 byte_in_group = bytenr - cache->key.objectid;
                 WARN_ON(byte_in_group > cache->key.offset);
@@ -4803,6 +4865,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
         struct btrfs_block_group_cache *cache;
         u64 bytenr;
  
+       spin_lock(&root->fs_info->block_group_cache_lock);
+       bytenr = root->fs_info->first_logical_byte;
+       spin_unlock(&root->fs_info->block_group_cache_lock);
+
+       if (bytenr < (u64)-1)
+               return bytenr;
+
         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
         if (!cache)
                 return 0;
@@ -4853,8 +4922,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
  /*
   * this function must be called within transaction
   */
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-                                   struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
                                     u64 bytenr, u64 num_bytes)
  {
         struct btrfs_block_group_cache *cache;
@@ -4868,7 +4936,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
          * to one because the slow code to read in the free extents does check
          * the pinned extents.
          */
-       cache_block_group(cache, trans, root, 1);
+       cache_block_group(cache, 1);
  
         pin_down_extent(root, cache, bytenr, num_bytes, 0);
  
@@ -4969,9 +5037,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
  {
         struct btrfs_fs_info *fs_info = root->fs_info;
         struct btrfs_block_group_cache *cache = NULL;
+       struct btrfs_space_info *space_info;
+       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
         u64 len;
+       bool readonly;
  
         while (start <= end) {
+               readonly = false;
                 if (!cache ||
                     start >= cache->key.objectid + cache->key.offset) {
                         if (cache)
@@ -4989,15 +5061,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
                 }
  
                 start += len;
+               space_info = cache->space_info;
  
-               spin_lock(&cache->space_info->lock);
+               spin_lock(&space_info->lock);
                 spin_lock(&cache->lock);
                 cache->pinned -= len;
-               cache->space_info->bytes_pinned -= len;
-               if (cache->ro)
-                       cache->space_info->bytes_readonly += len;
+               space_info->bytes_pinned -= len;
+               if (cache->ro) {
+                       space_info->bytes_readonly += len;
+                       readonly = true;
+               }
                 spin_unlock(&cache->lock);
-               spin_unlock(&cache->space_info->lock);
+               if (!readonly && global_rsv->space_info == space_info) {
+                       spin_lock(&global_rsv->lock);
+                       if (!global_rsv->full) {
+                               len = min(len, global_rsv->size -
+                                         global_rsv->reserved);
+                               global_rsv->reserved += len;
+                               space_info->bytes_may_use += len;
+                               if (global_rsv->reserved >= global_rsv->size)
+                                       global_rsv->full = 1;
+                       }
+                       spin_unlock(&global_rsv->lock);
+               }
+               spin_unlock(&space_info->lock);
         }
  
         if (cache)
@@ -5246,7 +5333,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                         }
                 }
  
-               ret = update_block_group(trans, root, bytenr, num_bytes, 0);
+               ret = update_block_group(root, bytenr, num_bytes, 0);
                 if (ret) {
                         btrfs_abort_transaction(trans, extent_root, ret);
                         goto out;
@@ -5291,7 +5378,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
         if (head->extent_op) {
                 if (!head->must_insert_reserved)
                         goto out;
-               kfree(head->extent_op);
+               btrfs_free_delayed_extent_op(head->extent_op);
                 head->extent_op = NULL;
         }
  
@@ -5466,27 +5553,23 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
         return 0;
  }
  
-static int __get_block_group_index(u64 flags)
+int __get_raid_index(u64 flags)
  {
-       int index;
-
         if (flags & BTRFS_BLOCK_GROUP_RAID10)
-               index = 0;
+               return BTRFS_RAID_RAID10;
         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
-               index = 1;
+               return BTRFS_RAID_RAID1;
         else if (flags & BTRFS_BLOCK_GROUP_DUP)
-               index = 2;
+               return BTRFS_RAID_DUP;
         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
-               index = 3;
+               return BTRFS_RAID_RAID0;
         else
-               index = 4;
-
-       return index;
+               return BTRFS_RAID_SINGLE;
  }
  
  static int get_block_group_index(struct btrfs_block_group_cache *cache)
  {
-       return __get_block_group_index(cache->flags);
+       return __get_raid_index(cache->flags);
  }
  
  enum btrfs_loop_type {
@@ -5519,7 +5602,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
         int empty_cluster = 2 * 1024 * 1024;
         struct btrfs_space_info *space_info;
         int loop = 0;
-       int index = 0;
+       int index = __get_raid_index(data);
         int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
                 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
         bool found_uncached_bg = false;
@@ -5639,8 +5722,7 @@ have_block_group:
                 cached = block_group_cache_done(block_group);
                 if (unlikely(!cached)) {
                         found_uncached_bg = true;
-                       ret = cache_block_group(block_group, trans,
-                                               orig_root, 0);
+                       ret = cache_block_group(block_group, 0);
                         BUG_ON(ret < 0);
                         ret = 0;
                 }
@@ -6069,7 +6151,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
         btrfs_mark_buffer_dirty(path->nodes[0]);
         btrfs_free_path(path);
  
-       ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+       ret = update_block_group(root, ins->objectid, ins->offset, 1);
         if (ret) { /* -ENOENT, logic error */
                 printk(KERN_ERR "btrfs update block group failed for %llu "
                        "%llu\n", (unsigned long long)ins->objectid,
@@ -6133,7 +6215,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
         btrfs_mark_buffer_dirty(leaf);
         btrfs_free_path(path);
  
-       ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+       ret = update_block_group(root, ins->objectid, ins->offset, 1);
         if (ret) { /* -ENOENT, logic error */
                 printk(KERN_ERR "btrfs update block group failed for %llu "
                        "%llu\n", (unsigned long long)ins->objectid,
@@ -6176,7 +6258,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
         u64 num_bytes = ins->offset;
  
         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-       cache_block_group(block_group, trans, NULL, 0);
+       cache_block_group(block_group, 0);
         caching_ctl = get_caching_control(block_group);
  
         if (!caching_ctl) {
@@ -6269,7 +6351,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
         block_rsv = get_block_rsv(trans, root);
  
         if (block_rsv->size == 0) {
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+                                            BTRFS_RESERVE_NO_FLUSH);
                 /*
                  * If we couldn't reserve metadata bytes try and use some from
                  * the global reserve.
@@ -6292,11 +6375,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
                 static DEFINE_RATELIMIT_STATE(_rs,
                                 DEFAULT_RATELIMIT_INTERVAL,
                                 /*DEFAULT_RATELIMIT_BURST*/ 2);
-               if (__ratelimit(&_rs)) {
-                       printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
-                       WARN_ON(1);
-               }
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               if (__ratelimit(&_rs))
+                       WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
+                            ret);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+                                            BTRFS_RESERVE_NO_FLUSH);
                 if (!ret) {
                         return block_rsv;
                 } else if (ret && block_rsv != global_rsv) {
@@ -6360,7 +6443,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
  
         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
                 struct btrfs_delayed_extent_op *extent_op;
-               extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+               extent_op = btrfs_alloc_delayed_extent_op();
                 BUG_ON(!extent_op); /* -ENOMEM */
                 if (key)
                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
@@ -6746,11 +6829,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                                                        &wc->flags[level]);
                         if (ret < 0) {
                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
+                               path->locks[level] = 0;
                                 return ret;
                         }
                         BUG_ON(wc->refs[level] == 0);
                         if (wc->refs[level] == 1) {
                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
+                               path->locks[level] = 0;
                                 return 1;
                         }
                 }
@@ -7427,7 +7512,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
          */
         target = get_restripe_target(root->fs_info, block_group->flags);
         if (target) {
-               index = __get_block_group_index(extended_to_chunk(target));
+               index = __get_raid_index(extended_to_chunk(target));
         } else {
                 /*
                  * this is just a balance, so if we were marked as full
@@ -7439,16 +7524,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                 index = get_block_group_index(block_group);
         }
  
-       if (index == 0) {
+       if (index == BTRFS_RAID_RAID10) {
                 dev_min = 4;
                 /* Divide by 2 */
                 min_free >>= 1;
-       } else if (index == 1) {
+       } else if (index == BTRFS_RAID_RAID1) {
                 dev_min = 2;
-       } else if (index == 2) {
+       } else if (index == BTRFS_RAID_DUP) {
                 /* Multiply by 2 */
                 min_free <<= 1;
-       } else if (index == 3) {
+       } else if (index == BTRFS_RAID_RAID0) {
                 dev_min = fs_devices->rw_devices;
                 do_div(min_free, dev_min);
         }
@@ -7461,7 +7546,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                  * check to make sure we can actually find a chunk with enough
                  * space to fit our block group in.
                  */
-               if (device->total_bytes > device->bytes_used + min_free) {
+               if (device->total_bytes > device->bytes_used + min_free &&
+                   !device->is_tgtdev_for_dev_replace) {
                         ret = find_free_dev_extent(device, min_free,
                                                    &dev_offset, NULL);
                         if (!ret)
@@ -7889,12 +7975,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
         u64 extra_flags = chunk_to_extended(flags) &
                                 BTRFS_EXTENDED_PROFILE_MASK;
  
+       write_seqlock(&fs_info->profiles_lock);
         if (flags & BTRFS_BLOCK_GROUP_DATA)
                 fs_info->avail_data_alloc_bits &= ~extra_flags;
         if (flags & BTRFS_BLOCK_GROUP_METADATA)
                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
                 fs_info->avail_system_alloc_bits &= ~extra_flags;
+       write_sequnlock(&fs_info->profiles_lock);
  }
  
  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
@@ -7993,6 +8081,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         spin_lock(&root->fs_info->block_group_cache_lock);
         rb_erase(&block_group->cache_node,
                  &root->fs_info->block_group_cache_tree);
+
+       if (root->fs_info->first_logical_byte == block_group->key.objectid)
+               root->fs_info->first_logical_byte = (u64)-1;
         spin_unlock(&root->fs_info->block_group_cache_lock);
  
         down_write(&block_group->space_info->groups_sem);
@@ -8115,7 +8206,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
  
                 if (end - start >= range->minlen) {
                         if (!block_group_cache_done(cache)) {
-                               ret = cache_block_group(cache, NULL, root, 0);
+                               ret = cache_block_group(cache, 0);
                                 if (!ret)
                                         wait_block_group_cache_done(cache);
                         }