btrfs: fix wrong free space information of btrfs
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9a325e4..04bfc3a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -242,6 +242,12 @@ get_caching_control(struct btrfs_block_group_cache *cache)
                return NULL;
        }
 
+       /* We're loading it the fast way, so we don't have a caching_ctl. */
+       if (!cache->caching_ctl) {
+               spin_unlock(&cache->lock);
+               return NULL;
+       }
+
        ctl = cache->caching_ctl;
        atomic_inc(&ctl->count);
        spin_unlock(&cache->lock);
@@ -423,6 +429,7 @@ err:
 
 static int cache_block_group(struct btrfs_block_group_cache *cache,
                             struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root,
                             int load_cache_only)
 {
        struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -436,9 +443,12 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 
        /*
         * We can't do the read from on-disk cache during a commit since we need
-        * to have the normal tree locking.
+        * to have the normal tree locking.  Also if we are currently trying to
+        * allocate blocks for the tree root we can't do the fast caching since
+        * we likely hold important locks.
         */
-       if (!trans->transaction->in_commit) {
+       if (!trans->transaction->in_commit &&
+           (root && root != root->fs_info->tree_root)) {
                spin_lock(&cache->lock);
                if (cache->cached != BTRFS_CACHE_NO) {
                        spin_unlock(&cache->lock);
@@ -541,7 +551,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
-               if (found->flags == flags) {
+               if (found->flags & flags) {
                        rcu_read_unlock();
                        return found;
                }
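
Switching the comparison from == to & lets a lookup for plain DATA or METADATA flags land on a mixed data+metadata space_info. A minimal userspace sketch of the two flag tests (the bit values are illustrative stand-ins, not copied from the btrfs headers):

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative flag bits; the real values live in the btrfs headers. */
    #define BG_DATA     (1ULL << 0)
    #define BG_METADATA (1ULL << 2)

    int main(void)
    {
            uint64_t mixed = BG_DATA | BG_METADATA; /* mixed space_info flags */

            /* Old test: exact match only, so a DATA lookup misses the mixed info. */
            printf("'==' match: %d\n", mixed == BG_DATA);
            /* New test: any shared bit matches, so the DATA lookup finds it. */
            printf("'&'  match: %d\n", (mixed & BG_DATA) != 0);
            return 0;
    }
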
@@ -574,6 +584,15 @@ static u64 div_factor(u64 num, int factor)
        return num;
 }
 
+static u64 div_factor_fine(u64 num, int factor)
+{
+       if (factor == 100)
+               return num;
+       num *= factor;
+       do_div(num, 100);
+       return num;
+}
+
 u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
 {
@@ -2727,6 +2746,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
        struct btrfs_root *root = block_group->fs_info->tree_root;
        struct inode *inode = NULL;
        u64 alloc_hint = 0;
+       int dcs = BTRFS_DC_ERROR;
        int num_pages = 0;
        int retries = 0;
        int ret = 0;
@@ -2781,6 +2801,8 @@ again:
 
        spin_lock(&block_group->lock);
        if (block_group->cached != BTRFS_CACHE_FINISHED) {
+               /* We're not cached, don't bother trying to write stuff out */
+               dcs = BTRFS_DC_WRITTEN;
                spin_unlock(&block_group->lock);
                goto out_put;
        }
@@ -2807,6 +2829,8 @@ again:
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
                                              num_pages, num_pages,
                                              &alloc_hint);
+       if (!ret)
+               dcs = BTRFS_DC_SETUP;
        btrfs_free_reserved_data_space(inode, num_pages);
 out_put:
        iput(inode);
@@ -2814,10 +2838,7 @@ out_free:
        btrfs_release_path(root, path);
 out:
        spin_lock(&block_group->lock);
-       if (ret)
-               block_group->disk_cache_state = BTRFS_DC_ERROR;
-       else
-               block_group->disk_cache_state = BTRFS_DC_SETUP;
+       block_group->disk_cache_state = dcs;
        spin_unlock(&block_group->lock);
 
        return ret;
@@ -2970,6 +2991,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        if (found) {
                spin_lock(&found->lock);
                found->total_bytes += total_bytes;
+               found->disk_total += total_bytes * factor;
                found->bytes_used += bytes_used;
                found->disk_used += bytes_used * factor;
                found->full = 0;
@@ -2989,6 +3011,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                                BTRFS_BLOCK_GROUP_SYSTEM |
                                BTRFS_BLOCK_GROUP_METADATA);
        found->total_bytes = total_bytes;
+       found->disk_total = total_bytes * factor;
        found->bytes_used = bytes_used;
        found->disk_used = bytes_used * factor;
        found->bytes_pinned = 0;
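
disk_total tracks raw bytes on disk, so profiles that store two copies (DUP/RAID1/RAID10) count each logical byte twice via factor. A small sketch of that accounting; the struct is a stand-in, not the kernel's btrfs_space_info:

    #include <stdint.h>
    #include <stdio.h>

    struct space_info {             /* stand-in for btrfs_space_info */
            uint64_t total_bytes;   /* logical bytes */
            uint64_t disk_total;    /* raw bytes on disk */
    };

    static void add_block_group(struct space_info *s, uint64_t bytes, int mirrored)
    {
            int factor = mirrored ? 2 : 1;  /* DUP/RAID1/RAID10 store two copies */

            s->total_bytes += bytes;
            s->disk_total += bytes * factor;
    }

    int main(void)
    {
            struct space_info meta = { 0, 0 };

            add_block_group(&meta, 256ULL << 20, 1);        /* 256 MiB RAID1 group */
            printf("logical %llu, on-disk %llu\n",
                   (unsigned long long)meta.total_bytes,
                   (unsigned long long)meta.disk_total);
            return 0;
    }
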
@@ -3021,7 +3044,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-       u64 num_devices = root->fs_info->fs_devices->rw_devices;
+       /*
+        * we add in the count of missing devices because we want
+        * to make sure that any RAID levels on a degraded FS
+        * continue to be honored.
+        */
+       u64 num_devices = root->fs_info->fs_devices->rw_devices +
+               root->fs_info->fs_devices->missing_devices;
 
        if (num_devices == 1)
                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
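
Counting missing devices keeps the profile-reduction logic from dropping RAID1/RAID0 on a degraded filesystem just because fewer devices are currently writable. A simplified userspace sketch of the device-count rule shown above (flag values are illustrative; the rest of btrfs_reduce_alloc_profile() isn't shown here):

    #include <stdint.h>
    #include <stdio.h>

    #define BG_RAID0 (1ULL << 3)
    #define BG_RAID1 (1ULL << 4)

    /* Simplified version of the device-count check in the allocation-profile code. */
    static uint64_t reduce_profile(uint64_t flags, uint64_t rw, uint64_t missing)
    {
            uint64_t num_devices = rw + missing;    /* missing devices still count */

            if (num_devices == 1)
                    flags &= ~(BG_RAID1 | BG_RAID0);
            return flags;
    }

    int main(void)
    {
            /* RAID1 fs mounted degraded: 1 rw device, 1 missing. */
            uint64_t flags = reduce_profile(BG_RAID1, 1, 1);

            printf("RAID1 kept: %s\n", (flags & BG_RAID1) ? "yes" : "no");
            return 0;
    }
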
@@ -3061,7 +3090,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
        return btrfs_reduce_alloc_profile(root, flags);
 }
 
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
        u64 flags;
 
@@ -3133,8 +3162,12 @@ alloc:
                                             bytes + 2 * 1024 * 1024,
                                             alloc_target, 0);
                        btrfs_end_transaction(trans, root);
-                       if (ret < 0)
-                               return ret;
+                       if (ret < 0) {
+                               if (ret != -ENOSPC)
+                                       return ret;
+                               else
+                                       goto commit_trans;
+                       }
 
                        if (!data_sinfo) {
                                btrfs_set_inode_space_info(root, inode);
@@ -3145,6 +3178,7 @@ alloc:
                spin_unlock(&data_sinfo->lock);
 
                /* commit the current transaction and try again */
+commit_trans:
                if (!committed && !root->fs_info->open_ioctl_trans) {
                        committed = 1;
                        trans = btrfs_join_transaction(root, 1);
@@ -3210,10 +3244,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
        rcu_read_unlock();
 }
 
-static int should_alloc_chunk(struct btrfs_space_info *sinfo,
-                             u64 alloc_bytes)
+static int should_alloc_chunk(struct btrfs_root *root,
+                             struct btrfs_space_info *sinfo, u64 alloc_bytes)
 {
        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+       u64 thresh;
 
        if (sinfo->bytes_used + sinfo->bytes_reserved +
            alloc_bytes + 256 * 1024 * 1024 < num_bytes)
@@ -3223,6 +3258,12 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo,
            alloc_bytes < div_factor(num_bytes, 8))
                return 0;
 
+       thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+       thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+
+       if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
+               return 0;
+
        return 1;
 }
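
The new test refuses to grow a space_info that already exceeds max(256 MiB, 5% of the filesystem) while its usage is still below div_factor(num_bytes, 3). A userspace sketch of that threshold, condensed from the hunk above (used_limit stands in for the div_factor() call, whose body isn't shown in this hunk):

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Simplified form of the new check in should_alloc_chunk(): skip the chunk
     * allocation when the space_info is already bigger than the threshold but
     * still mostly empty.
     */
    static int skip_chunk_alloc(uint64_t sinfo_bytes, uint64_t bytes_used,
                                uint64_t used_limit, uint64_t fs_total)
    {
            uint64_t thresh = fs_total * 5 / 100;   /* 5% of the filesystem */

            if (thresh < 256ULL << 20)
                    thresh = 256ULL << 20;          /* at least 256 MiB */

            return sinfo_bytes > thresh && bytes_used < used_limit;
    }

    int main(void)
    {
            uint64_t sinfo = 1ULL << 30;            /* 1 GiB space_info */

            /* 100 MiB used on a 10 GiB filesystem; limit set to a third of sinfo */
            printf("skip allocation: %d\n",
                   skip_chunk_alloc(sinfo, 100ULL << 20, sinfo / 3, 10ULL << 30));
            return 0;
    }
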
 
@@ -3254,12 +3295,20 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                goto out;
        }
 
-       if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
+       if (!force && !should_alloc_chunk(extent_root, space_info,
+                                         alloc_bytes)) {
                spin_unlock(&space_info->lock);
                goto out;
        }
        spin_unlock(&space_info->lock);
 
+       /*
+        * If we have mixed data/metadata chunks we want to make sure we keep
+        * allocating mixed chunks instead of individual chunks.
+        */
+       if (btrfs_mixed_space_info(space_info))
+               flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
+
        /*
         * if we're doing a data chunk, go ahead and make sure that
         * we keep a reasonable number of metadata chunks allocated in the
@@ -3285,55 +3334,25 @@ out:
        return ret;
 }
 
-static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root,
-                               struct btrfs_space_info *sinfo, u64 num_bytes)
-{
-       int ret;
-       int end_trans = 0;
-
-       if (sinfo->full)
-               return 0;
-
-       spin_lock(&sinfo->lock);
-       ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
-       spin_unlock(&sinfo->lock);
-       if (!ret)
-               return 0;
-
-       if (!trans) {
-               trans = btrfs_join_transaction(root, 1);
-               BUG_ON(IS_ERR(trans));
-               end_trans = 1;
-       }
-
-       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                            num_bytes + 2 * 1024 * 1024,
-                            get_alloc_profile(root, sinfo->flags), 0);
-
-       if (end_trans)
-               btrfs_end_transaction(trans, root);
-
-       return ret == 1 ? 1 : 0;
-}
-
 /*
  * shrink metadata reservation for delalloc
  */
 static int shrink_delalloc(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root, u64 to_reclaim)
+                          struct btrfs_root *root, u64 to_reclaim, int sync)
 {
        struct btrfs_block_rsv *block_rsv;
+       struct btrfs_space_info *space_info;
        u64 reserved;
        u64 max_reclaim;
        u64 reclaimed = 0;
        int pause = 1;
-       int ret;
+       int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
 
        block_rsv = &root->fs_info->delalloc_block_rsv;
-       spin_lock(&block_rsv->lock);
-       reserved = block_rsv->reserved;
-       spin_unlock(&block_rsv->lock);
+       space_info = block_rsv->space_info;
+
+       smp_mb();
+       reserved = space_info->bytes_reserved;
 
        if (reserved == 0)
                return 0;
@@ -3341,104 +3360,169 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
        max_reclaim = min(reserved, to_reclaim);
 
        while (1) {
-               ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
-               if (!ret) {
-                       __set_current_state(TASK_INTERRUPTIBLE);
-                       schedule_timeout(pause);
-                       pause <<= 1;
-                       if (pause > HZ / 10)
-                               pause = HZ / 10;
-               } else {
-                       pause = 1;
-               }
+               /* have the flusher threads jump in and do some IO */
+               smp_mb();
+               nr_pages = min_t(unsigned long, nr_pages,
+                      root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
+               writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
 
-               spin_lock(&block_rsv->lock);
-               if (reserved > block_rsv->reserved)
-                       reclaimed = reserved - block_rsv->reserved;
-               reserved = block_rsv->reserved;
-               spin_unlock(&block_rsv->lock);
+               spin_lock(&space_info->lock);
+               if (reserved > space_info->bytes_reserved)
+                       reclaimed += reserved - space_info->bytes_reserved;
+               reserved = space_info->bytes_reserved;
+               spin_unlock(&space_info->lock);
 
                if (reserved == 0 || reclaimed >= max_reclaim)
                        break;
 
                if (trans && trans->transaction->blocked)
                        return -EAGAIN;
+
+               __set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(pause);
+               pause <<= 1;
+               if (pause > HZ / 10)
+                       pause = HZ / 10;
+
        }
        return reclaimed >= to_reclaim;
 }
 
-static int should_retry_reserve(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root,
-                               struct btrfs_block_rsv *block_rsv,
-                               u64 num_bytes, int *retries)
+/*
+ * retries tells us how many times we've called reserve_metadata_bytes.  The
+ * idea is that on the first call (retries == 0) we add to our reserved count
+ * if we can't make the allocation, in order to hold our place while we go and
+ * try to free up space.  On later passes we don't add space again; we just
+ * check that the space already accounted for doesn't exceed the total space,
+ * meaning that our place-holder reservation is still valid.
+ *
+ * Callers that don't want us to flush and retry pass flush == 0, which
+ * short-circuits this logic.
+ */
+static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 struct btrfs_block_rsv *block_rsv,
+                                 u64 orig_bytes, int flush)
 {
        struct btrfs_space_info *space_info = block_rsv->space_info;
-       int ret;
+       u64 unused;
+       u64 num_bytes = orig_bytes;
+       int retries = 0;
+       int ret = 0;
+       bool reserved = false;
+       bool committed = false;
 
-       if ((*retries) > 2)
-               return -ENOSPC;
+again:
+       ret = -ENOSPC;
+       if (reserved)
+               num_bytes = 0;
 
-       ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
-       if (ret)
-               return 1;
+       spin_lock(&space_info->lock);
+       unused = space_info->bytes_used + space_info->bytes_reserved +
+                space_info->bytes_pinned + space_info->bytes_readonly +
+                space_info->bytes_may_use;
 
-       if (trans && trans->transaction->in_commit)
-               return -ENOSPC;
+       /*
+        * The idea here is that if we haven't already over-reserved the block
+        * group, we can go ahead and save our reservation first and then start
+        * flushing if we need to.  Otherwise, if we've already overcommitted,
+        * let's start flushing stuff first and then come back and try to make
+        * our reservation.
+        */
+       if (unused <= space_info->total_bytes) {
+               unused = space_info->total_bytes - unused;
+               if (unused >= num_bytes) {
+                       if (!reserved)
+                               space_info->bytes_reserved += orig_bytes;
+                       ret = 0;
+               } else {
+                       /*
+                        * OK, set num_bytes to orig_bytes since we aren't
+                        * overcommitted; this way we only try to reclaim what
+                        * we need.
+                        */
+                       num_bytes = orig_bytes;
+               }
+       } else {
+               /*
+                * OK, we're overcommitted; set num_bytes to the overcommitted
+                * amount plus the number of bytes we need for this
+                * reservation.
+                */
+               num_bytes = unused - space_info->total_bytes +
+                       (orig_bytes * (retries + 1));
+       }
 
-       ret = shrink_delalloc(trans, root, num_bytes);
-       if (ret)
-               return ret;
+       /*
+        * Couldn't make our reservation; save our place so that while we're
+        * trying to reclaim space we can actually use it instead of somebody
+        * else stealing it from us.
+        */
+       if (ret && !reserved) {
+               space_info->bytes_reserved += orig_bytes;
+               reserved = true;
+       }
 
-       spin_lock(&space_info->lock);
-       if (space_info->bytes_pinned < num_bytes)
-               ret = 1;
        spin_unlock(&space_info->lock);
-       if (ret)
-               return -ENOSPC;
-
-       (*retries)++;
 
-       if (trans)
-               return -EAGAIN;
+       if (!ret)
+               return 0;
 
-       trans = btrfs_join_transaction(root, 1);
-       BUG_ON(IS_ERR(trans));
-       ret = btrfs_commit_transaction(trans, root);
-       BUG_ON(ret);
+       if (!flush)
+               goto out;
 
-       return 1;
-}
+       /*
+        * We do synchronous shrinking since we don't actually unreserve
+        * metadata until after the IO is completed.
+        */
+       ret = shrink_delalloc(trans, root, num_bytes, 1);
+       if (ret > 0)
+               return 0;
+       else if (ret < 0)
+               goto out;
 
-static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
-                                 u64 num_bytes)
-{
-       struct btrfs_space_info *space_info = block_rsv->space_info;
-       u64 unused;
-       int ret = -ENOSPC;
+       /*
+        * So if we were overcommitted it's possible that somebody else flushed
+        * out enough space and we simply didn't have enough space to reclaim,
+        * so go back around and try again.
+        */
+       if (retries < 2) {
+               retries++;
+               goto again;
+       }
 
        spin_lock(&space_info->lock);
-       unused = space_info->bytes_used + space_info->bytes_reserved +
-                space_info->bytes_pinned + space_info->bytes_readonly;
+       /*
+        * Not enough space to be reclaimed, don't bother committing the
+        * transaction.
+        */
+       if (space_info->bytes_pinned < orig_bytes)
+               ret = -ENOSPC;
+       spin_unlock(&space_info->lock);
+       if (ret)
+               goto out;
 
-       if (unused < space_info->total_bytes)
-               unused = space_info->total_bytes - unused;
-       else
-               unused = 0;
+       ret = -EAGAIN;
+       if (trans || committed)
+               goto out;
 
-       if (unused >= num_bytes) {
-               if (block_rsv->priority >= 10) {
-                       space_info->bytes_reserved += num_bytes;
-                       ret = 0;
-               } else {
-                       if ((unused + block_rsv->reserved) *
-                           block_rsv->priority >=
-                           (num_bytes + block_rsv->reserved) * 10) {
-                               space_info->bytes_reserved += num_bytes;
-                               ret = 0;
-                       }
-               }
+       ret = -ENOSPC;
+       trans = btrfs_join_transaction(root, 1);
+       if (IS_ERR(trans))
+               goto out;
+       ret = btrfs_commit_transaction(trans, root);
+       if (!ret) {
+               trans = NULL;
+               committed = true;
+               goto again;
+       }
+
+out:
+       if (reserved) {
+               spin_lock(&space_info->lock);
+               space_info->bytes_reserved -= orig_bytes;
+               spin_unlock(&space_info->lock);
        }
-       spin_unlock(&space_info->lock);
 
        return ret;
 }
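
The rewritten reserve_metadata_bytes() first tries to take the reservation outright; if the space_info is overcommitted it parks the bytes in bytes_reserved anyway (so nobody steals the space it is about to reclaim), flushes delalloc, retries a couple of times, and only then falls back to committing the transaction. A heavily simplified, single-threaded userspace sketch of that control flow (no locking; flush_delalloc() and commit_transaction() are crude stand-ins for the real kernel reclaim paths):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct space_info {
            uint64_t total_bytes;
            uint64_t bytes_used;    /* stands in for used+pinned+readonly+may_use */
            uint64_t bytes_reserved;
    };

    /* Stand-ins for the real reclaim paths. */
    static void flush_delalloc(struct space_info *s) { s->bytes_used -= s->bytes_used / 10; }
    static bool commit_transaction(struct space_info *s) { s->bytes_used /= 2; return true; }

    static int reserve_metadata_bytes(struct space_info *s, uint64_t orig_bytes, int flush)
    {
            bool reserved = false, committed = false;
            int retries = 0;

    again:
            {
                    uint64_t unused = s->bytes_used + s->bytes_reserved;

                    if (unused <= s->total_bytes &&
                        s->total_bytes - unused >= (reserved ? 0 : orig_bytes)) {
                            if (!reserved)
                                    s->bytes_reserved += orig_bytes;
                            return 0;               /* reservation made */
                    }
                    if (!reserved) {                /* hold our place while reclaiming */
                            s->bytes_reserved += orig_bytes;
                            reserved = true;
                    }
            }
            if (!flush)
                    goto out;
            flush_delalloc(s);
            if (retries++ < 2)
                    goto again;
            if (!committed && commit_transaction(s)) {      /* last resort */
                    committed = true;
                    goto again;
            }
    out:
            if (reserved)
                    s->bytes_reserved -= orig_bytes;        /* give the place-holder back */
            return -1;                              /* -ENOSPC in the kernel */
    }

    int main(void)
    {
            struct space_info s = { .total_bytes = 1000, .bytes_used = 990 };

            printf("reserve 50: %d (reserved now %llu)\n",
                   reserve_metadata_bytes(&s, 50, 1),
                   (unsigned long long)s.bytes_reserved);
            return 0;
    }
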
@@ -3540,18 +3624,14 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 {
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_fs_info *fs_info = root->fs_info;
-       u64 alloc_target;
 
        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
        if (!block_rsv)
                return NULL;
 
        btrfs_init_block_rsv(block_rsv);
-
-       alloc_target = btrfs_get_alloc_profile(root, 0);
        block_rsv->space_info = __find_space_info(fs_info,
                                                  BTRFS_BLOCK_GROUP_METADATA);
-
        return block_rsv;
 }
 
@@ -3582,23 +3662,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct btrfs_block_rsv *block_rsv,
-                       u64 num_bytes, int *retries)
+                       u64 num_bytes)
 {
        int ret;
 
        if (num_bytes == 0)
                return 0;
-again:
-       ret = reserve_metadata_bytes(block_rsv, num_bytes);
+
+       ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
        if (!ret) {
                block_rsv_add_bytes(block_rsv, num_bytes, 1);
                return 0;
        }
 
-       ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
-       if (ret > 0)
-               goto again;
-
        return ret;
 }
 
@@ -3633,7 +3709,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
                return 0;
 
        if (block_rsv->refill_used) {
-               ret = reserve_metadata_bytes(block_rsv, num_bytes);
+               ret = reserve_metadata_bytes(trans, root, block_rsv,
+                                            num_bytes, 0);
                if (!ret) {
                        block_rsv_add_bytes(block_rsv, num_bytes, 0);
                        return 0;
@@ -3712,6 +3789,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 
        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
        spin_lock(&sinfo->lock);
+       if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
+               data_used = 0;
        meta_used = sinfo->bytes_used;
        spin_unlock(&sinfo->lock);
 
@@ -3739,7 +3818,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
        block_rsv->size = num_bytes;
 
        num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
-                   sinfo->bytes_reserved + sinfo->bytes_readonly;
+                   sinfo->bytes_reserved + sinfo->bytes_readonly +
+                   sinfo->bytes_may_use;
 
        if (sinfo->total_bytes > num_bytes) {
                num_bytes = sinfo->total_bytes - num_bytes;
@@ -3810,7 +3890,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
 
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
-                                int num_items, int *retries)
+                                int num_items)
 {
        u64 num_bytes;
        int ret;
@@ -3820,7 +3900,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 
        num_bytes = calc_trans_metadata_size(root, num_items);
        ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
-                                 num_bytes, retries);
+                                 num_bytes);
        if (!ret) {
                trans->bytes_reserved += num_bytes;
                trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3894,14 +3974,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
        u64 to_reserve;
        int nr_extents;
-       int retries = 0;
        int ret;
 
        if (btrfs_transaction_in_commit(root->fs_info))
                schedule_timeout(1);
 
        num_bytes = ALIGN(num_bytes, root->sectorsize);
-again:
+
        spin_lock(&BTRFS_I(inode)->accounting_lock);
        nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
        if (nr_extents > BTRFS_I(inode)->reserved_extents) {
@@ -3911,18 +3990,14 @@ again:
                nr_extents = 0;
                to_reserve = 0;
        }
+       spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
        to_reserve += calc_csum_metadata_size(inode, num_bytes);
-       ret = reserve_metadata_bytes(block_rsv, to_reserve);
-       if (ret) {
-               spin_unlock(&BTRFS_I(inode)->accounting_lock);
-               ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
-                                          &retries);
-               if (ret > 0)
-                       goto again;
+       ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
+       if (ret)
                return ret;
-       }
 
+       spin_lock(&BTRFS_I(inode)->accounting_lock);
        BTRFS_I(inode)->reserved_extents += nr_extents;
        atomic_inc(&BTRFS_I(inode)->outstanding_extents);
        spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -3930,7 +4005,7 @@ again:
        block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
        if (block_rsv->size > 512 * 1024 * 1024)
-               shrink_delalloc(NULL, root, to_reserve);
+               shrink_delalloc(NULL, root, to_reserve, 0);
 
        return 0;
 }
@@ -4023,7 +4098,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                 * space back to the block group, otherwise we will leak space.
                 */
                if (!alloc && cache->cached == BTRFS_CACHE_NO)
-                       cache_block_group(cache, trans, 1);
+                       cache_block_group(cache, trans, NULL, 1);
 
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
@@ -4781,6 +4856,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        bool found_uncached_bg = false;
        bool failed_cluster_refill = false;
        bool failed_alloc = false;
+       bool use_cluster = true;
        u64 ideal_cache_percent = 0;
        u64 ideal_cache_offset = 0;
 
@@ -4795,16 +4871,24 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
                return -ENOSPC;
        }
 
+       /*
+        * If the space info is for both data and metadata it means we have a
+        * small filesystem and we can't use the clustering stuff.
+        */
+       if (btrfs_mixed_space_info(space_info))
+               use_cluster = false;
+
        if (orig_root->ref_cows || empty_size)
                allowed_chunk_alloc = 1;
 
-       if (data & BTRFS_BLOCK_GROUP_METADATA) {
+       if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
                last_ptr = &root->fs_info->meta_alloc_cluster;
                if (!btrfs_test_opt(root, SSD))
                        empty_cluster = 64 * 1024;
        }
 
-       if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+       if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
+           btrfs_test_opt(root, SSD)) {
                last_ptr = &root->fs_info->data_alloc_cluster;
        }
 
@@ -4864,11 +4948,31 @@ search:
                btrfs_get_block_group(block_group);
                search_start = block_group->key.objectid;
 
+               /*
+                * this can happen if we end up cycling through all the
+                * raid types, but we want to make sure we only allocate
+                * for the proper type.
+                */
+               if (!block_group_bits(block_group, data)) {
+                   u64 extra = BTRFS_BLOCK_GROUP_DUP |
+                               BTRFS_BLOCK_GROUP_RAID1 |
+                               BTRFS_BLOCK_GROUP_RAID10;
+
+                       /*
+                        * if they asked for extra copies and this block group
+                        * doesn't provide them, bail.  This does allow us to
+                        * fill raid0 from raid1.
+                        */
+                       if ((data & extra) && !(block_group->flags & extra))
+                               goto loop;
+               }
+
 have_block_group:
                if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
                        u64 free_percent;
 
-                       ret = cache_block_group(block_group, trans, 1);
+                       ret = cache_block_group(block_group, trans,
+                                               orig_root, 1);
                        if (block_group->cached == BTRFS_CACHE_FINISHED)
                                goto have_block_group;
 
@@ -4892,7 +4996,8 @@ have_block_group:
                        if (loop > LOOP_CACHING_NOWAIT ||
                            (loop > LOOP_FIND_IDEAL &&
                             atomic_read(&space_info->caching_threads) < 2)) {
-                               ret = cache_block_group(block_group, trans, 0);
+                               ret = cache_block_group(block_group, trans,
+                                                       orig_root, 0);
                                BUG_ON(ret);
                        }
                        found_uncached_bg = true;
@@ -5449,7 +5554,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
        u64 num_bytes = ins->offset;
 
        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-       cache_block_group(block_group, trans, 0);
+       cache_block_group(block_group, trans, NULL, 0);
        caching_ctl = get_caching_control(block_group);
 
        if (!caching_ctl) {
@@ -5539,7 +5644,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        block_rsv = get_block_rsv(trans, root);
 
        if (block_rsv->size == 0) {
-               ret = reserve_metadata_bytes(block_rsv, blocksize);
+               ret = reserve_metadata_bytes(trans, root, block_rsv,
+                                            blocksize, 0);
                if (ret)
                        return ERR_PTR(ret);
                return block_rsv;
@@ -5549,11 +5655,6 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        if (!ret)
                return block_rsv;
 
-       WARN_ON(1);
-       printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
-               block_rsv->size, block_rsv->reserved,
-               block_rsv->freed[0], block_rsv->freed[1]);
-
        return ERR_PTR(-ENOSPC);
 }
 
@@ -5652,7 +5753,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
        u64 generation;
        u64 refs;
        u64 flags;
-       u64 last = 0;
        u32 nritems;
        u32 blocksize;
        struct btrfs_key key;
@@ -5720,7 +5820,6 @@ reada:
                                           generation);
                if (ret)
                        break;
-               last = bytenr + blocksize;
                nread++;
        }
        wc->reada_slot = slot;
@@ -6240,9 +6339,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                                           NULL, NULL);
                BUG_ON(ret < 0);
                if (ret > 0) {
-                       ret = btrfs_del_orphan_item(trans, tree_root,
-                                                   root->root_key.objectid);
-                       BUG_ON(ret);
+                       /* if we fail to delete the orphan item this time
+                        * around, it'll get picked up the next time.
+                        *
+                        * The most common failure here is just -ENOENT.
+                        */
+                       btrfs_del_orphan_item(trans, tree_root,
+                                             root->root_key.objectid);
                }
        }
 
@@ -7818,7 +7921,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
        u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
-       num_devices = root->fs_info->fs_devices->rw_devices;
+       /*
+        * we add in the count of missing devices because we want
+        * to make sure that any RAID levels on a degraded FS
+        * continue to be honored.
+        */
+       num_devices = root->fs_info->fs_devices->rw_devices +
+               root->fs_info->fs_devices->missing_devices;
+
        if (num_devices == 1) {
                stripped |= BTRFS_BLOCK_GROUP_DUP;
                stripped = flags & ~stripped;
@@ -7866,13 +7976,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
 
        if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
            sinfo->bytes_may_use + sinfo->bytes_readonly +
-           cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+           cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
                sinfo->bytes_reserved += cache->reserved_pinned;
                cache->reserved_pinned = 0;
                cache->ro = 1;
                ret = 0;
        }
+
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
        return ret;
@@ -7908,6 +8019,62 @@ out:
        return ret;
 }
 
+/*
+ * helper to account for the unused space of all the read-only block groups in
+ * the list.  Takes mirrors into account.
+ */
+static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+{
+       struct btrfs_block_group_cache *block_group;
+       u64 free_bytes = 0;
+       int factor;
+
+       list_for_each_entry(block_group, groups_list, list) {
+               spin_lock(&block_group->lock);
+
+               if (!block_group->ro) {
+                       spin_unlock(&block_group->lock);
+                       continue;
+               }
+
+               if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
+                                         BTRFS_BLOCK_GROUP_RAID10 |
+                                         BTRFS_BLOCK_GROUP_DUP))
+                       factor = 2;
+               else
+                       factor = 1;
+
+               free_bytes += (block_group->key.offset -
+                              btrfs_block_group_used(&block_group->item)) *
+                              factor;
+
+               spin_unlock(&block_group->lock);
+       }
+
+       return free_bytes;
+}
+
+/*
+ * helper to account for the unused space of all the read-only block groups in
+ * the space_info.  Takes mirrors into account.
+ */
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+{
+       int i;
+       u64 free_bytes = 0;
+
+       spin_lock(&sinfo->lock);
+
+       for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+               if (!list_empty(&sinfo->block_groups[i]))
+                       free_bytes += __btrfs_get_ro_block_group_free_space(
+                                               &sinfo->block_groups[i]);
+
+       spin_unlock(&sinfo->lock);
+
+       return free_bytes;
+}
+
 int btrfs_set_block_group_rw(struct btrfs_root *root,
                              struct btrfs_block_group_cache *cache)
 {
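
The helper sums, for each read-only block group, (size - used) scaled by a mirror factor (2 for DUP/RAID1/RAID10), i.e. the raw disk space the group still holds unused. A small userspace sketch of that per-group arithmetic (the struct is a stand-in for btrfs_block_group_cache):

    #include <stdint.h>
    #include <stdio.h>

    struct ro_group {               /* stand-in for a read-only block group */
            uint64_t size;          /* key.offset in the kernel */
            uint64_t used;          /* btrfs_block_group_used() */
            int mirrored;           /* DUP/RAID1/RAID10 */
    };

    static uint64_t ro_free_space(const struct ro_group *g, size_t n)
    {
            uint64_t free_bytes = 0;
            size_t i;

            for (i = 0; i < n; i++) {
                    int factor = g[i].mirrored ? 2 : 1;

                    /* unused logical space, scaled to raw bytes on disk */
                    free_bytes += (g[i].size - g[i].used) * factor;
            }
            return free_bytes;
    }

    int main(void)
    {
            struct ro_group groups[] = {
                    { 1ULL << 30, 700ULL << 20, 1 },  /* 1 GiB RAID1 group, 700 MiB used */
                    { 1ULL << 30, 200ULL << 20, 0 },  /* 1 GiB single group, 200 MiB used */
            };

            printf("ro free space: %llu bytes\n",
                   (unsigned long long)ro_free_space(groups, 2));
            return 0;
    }
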
@@ -7988,7 +8155,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
        mutex_lock(&root->fs_info->chunk_mutex);
        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
                u64 min_free = btrfs_block_group_used(&block_group->item);
-               u64 dev_offset, max_avail;
+               u64 dev_offset;
 
                /*
                 * check to make sure we can actually find a chunk with enough
@@ -7996,7 +8163,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                 */
                if (device->total_bytes > device->bytes_used + min_free) {
                        ret = find_free_dev_extent(NULL, device, min_free,
-                                                  &dev_offset, &max_avail);
+                                                  &dev_offset, NULL);
                        if (!ret)
                                break;
                        ret = -1;
@@ -8176,6 +8343,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
        if (cache_gen != 0 &&
            btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
                need_clear = 1;
+       if (btrfs_test_opt(root, CLEAR_CACHE))
+               need_clear = 1;
+       if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
+               printk(KERN_INFO "btrfs: disk space caching is enabled\n");
 
        while (1) {
                ret = find_first_block_group(root, path, &key);
@@ -8183,7 +8354,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                        break;
                if (ret != 0)
                        goto error;
-
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                cache = kzalloc(sizeof(*cache), GFP_NOFS);
@@ -8367,6 +8537,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        struct btrfs_key key;
        struct inode *inode;
        int ret;
+       int factor;
 
        root = root->fs_info->extent_root;
 
@@ -8374,6 +8545,14 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        BUG_ON(!block_group);
        BUG_ON(!block_group->ro);
 
+       memcpy(&key, &block_group->key, sizeof(key));
+       if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
+                                 BTRFS_BLOCK_GROUP_RAID1 |
+                                 BTRFS_BLOCK_GROUP_RAID10))
+               factor = 2;
+       else
+               factor = 1;
+
        /* make sure this block group isn't part of an allocation cluster */
        cluster = &root->fs_info->data_alloc_cluster;
        spin_lock(&cluster->refill_lock);
@@ -8447,6 +8626,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        spin_lock(&block_group->space_info->lock);
        block_group->space_info->total_bytes -= block_group->key.offset;
        block_group->space_info->bytes_readonly -= block_group->key.offset;
+       block_group->space_info->disk_total -= block_group->key.offset * factor;
        spin_unlock(&block_group->space_info->lock);
 
        memcpy(&key, &block_group->key, sizeof(key));