diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 47c1ba1..222d6ae 100644
@@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache)
        struct btrfs_caching_control *ctl;
 
        spin_lock(&cache->lock);
-       if (cache->cached != BTRFS_CACHE_STARTED) {
-               spin_unlock(&cache->lock);
-               return NULL;
-       }
-
-       /* We're loading it the fast way, so we don't have a caching_ctl. */
        if (!cache->caching_ctl) {
                spin_unlock(&cache->lock);
                return NULL;
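
With the BTRFS_CACHE_STARTED check gone, the caching control can be looked up
even after caching has moved past the STARTED state; the block group removal
hunks further down rely on that when they call get_caching_control(). As a
minimal sketch, assuming the remainder of the function is unchanged, the
resulting helper is roughly:

    static struct btrfs_caching_control *
    get_caching_control(struct btrfs_block_group_cache *cache)
    {
            struct btrfs_caching_control *ctl;

            spin_lock(&cache->lock);
            if (!cache->caching_ctl) {
                    spin_unlock(&cache->lock);
                    return NULL;
            }

            /* Take a reference under cache->lock; the caller drops it with
             * put_caching_control(). */
            ctl = cache->caching_ctl;
            atomic_inc(&ctl->count);
            spin_unlock(&cache->lock);
            return ctl;
    }
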
@@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
        spin_unlock(&cache->lock);
 
        if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
+               mutex_lock(&caching_ctl->mutex);
                ret = load_free_space_cache(fs_info, cache);
 
                spin_lock(&cache->lock);
@@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                        cache->caching_ctl = NULL;
                        cache->cached = BTRFS_CACHE_FINISHED;
                        cache->last_byte_to_unpin = (u64)-1;
+                       caching_ctl->progress = (u64)-1;
                } else {
                        if (load_cache_only) {
                                cache->caching_ctl = NULL;
                                cache->cached = BTRFS_CACHE_NO;
                        } else {
                                cache->cached = BTRFS_CACHE_STARTED;
+                               cache->has_caching_ctl = 1;
                        }
                }
                spin_unlock(&cache->lock);
+               mutex_unlock(&caching_ctl->mutex);
+
                wake_up(&caching_ctl->wait);
                if (ret == 1) {
                        put_caching_control(caching_ctl);
@@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                        cache->cached = BTRFS_CACHE_NO;
                } else {
                        cache->cached = BTRFS_CACHE_STARTED;
+                       cache->has_caching_ctl = 1;
                }
                spin_unlock(&cache->lock);
                wake_up(&caching_ctl->wait);
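
The caching control is reference counted: cache_block_group() drops its
reference with put_caching_control() once the control has served its purpose,
and btrfs_remove_block_group() below drops two ("once for the caching bgs list
and once for us"). The convention is a bare atomic count, roughly (a sketch of
the convention, not necessarily the literal helper):

    static void put_caching_control(struct btrfs_caching_control *ctl)
    {
            /* The last reference frees the control. */
            if (atomic_dec_and_test(&ctl->count))
                    kfree(ctl);
    }
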
@@ -3162,7 +3162,19 @@ next_block_group(struct btrfs_root *root,
                 struct btrfs_block_group_cache *cache)
 {
        struct rb_node *node;
+
        spin_lock(&root->fs_info->block_group_cache_lock);
+
+       /* If our block group was removed, we need a full search. */
+       if (RB_EMPTY_NODE(&cache->cache_node)) {
+               const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+
+               spin_unlock(&root->fs_info->block_group_cache_lock);
+               btrfs_put_block_group(cache);
+               cache = btrfs_lookup_first_block_group(root->fs_info,
+                                                      next_bytenr);
+               return cache;
+       }
        node = rb_next(&cache->cache_node);
        btrfs_put_block_group(cache);
        if (node) {
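
The RB_EMPTY_NODE() test above only works because the removal path now pairs
rb_erase() with RB_CLEAR_NODE() (see the hunk in btrfs_remove_block_group()
further down). The generic detach-and-detect pattern, as a self-contained
sketch against <linux/rbtree.h> with illustrative function names:

    #include <linux/rbtree.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    /*
     * Sketch (not btrfs code): clear the node when erasing it so that a task
     * still holding a pointer to the object can detect, under the same lock,
     * that it was already unlinked from the tree.
     */
    static void sketch_remove(struct rb_root *tree, struct rb_node *node,
                              spinlock_t *lock)
    {
            spin_lock(lock);
            rb_erase(node, tree);
            RB_CLEAR_NODE(node);            /* RB_EMPTY_NODE(node) is now true */
            spin_unlock(lock);
    }

    static bool sketch_was_removed(struct rb_node *node, spinlock_t *lock)
    {
            bool removed;

            spin_lock(lock);
            removed = RB_EMPTY_NODE(node);
            spin_unlock(lock);
            return removed;
    }
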
@@ -3504,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->chunk_alloc = 0;
        found->flush = 0;
        init_waitqueue_head(&found->wait);
+       INIT_LIST_HEAD(&found->ro_bgs);
 
        ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
                                    info->space_info_kobj, "%s",
@@ -5425,7 +5438,17 @@ static int update_block_group(struct btrfs_root *root,
                        spin_unlock(&cache->space_info->lock);
                } else {
                        old_val -= num_bytes;
+                       btrfs_set_block_group_used(&cache->item, old_val);
+                       cache->pinned += num_bytes;
+                       cache->space_info->bytes_pinned += num_bytes;
+                       cache->space_info->bytes_used -= num_bytes;
+                       cache->space_info->disk_used -= num_bytes * factor;
+                       spin_unlock(&cache->lock);
+                       spin_unlock(&cache->space_info->lock);
 
+                       set_extent_dirty(info->pinned_extents,
+                                        bytenr, bytenr + num_bytes - 1,
+                                        GFP_NOFS | __GFP_NOFAIL);
                        /*
                         * No longer have used bytes in this block group, queue
                         * it for deletion.
@@ -5439,17 +5462,6 @@ static int update_block_group(struct btrfs_root *root,
                                }
                                spin_unlock(&info->unused_bgs_lock);
                        }
-                       btrfs_set_block_group_used(&cache->item, old_val);
-                       cache->pinned += num_bytes;
-                       cache->space_info->bytes_pinned += num_bytes;
-                       cache->space_info->bytes_used -= num_bytes;
-                       cache->space_info->disk_used -= num_bytes * factor;
-                       spin_unlock(&cache->lock);
-                       spin_unlock(&cache->space_info->lock);
-
-                       set_extent_dirty(info->pinned_extents,
-                                        bytenr, bytenr + num_bytes - 1,
-                                        GFP_NOFS | __GFP_NOFAIL);
                }
                btrfs_put_block_group(cache);
                total -= num_bytes;
@@ -8511,6 +8523,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
            min_allocable_bytes <= sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
                cache->ro = 1;
+               list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
                ret = 0;
        }
 out:
@@ -8565,15 +8578,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
 
 /*
  * helper to account the unused space of all the readonly block group in the
- * list. takes mirrors into account.
+ * space_info. takes mirrors into account.
  */
-static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
 {
        struct btrfs_block_group_cache *block_group;
        u64 free_bytes = 0;
        int factor;
 
-       list_for_each_entry(block_group, groups_list, list) {
+       /* It's df, we don't care if it's racey */
+       if (list_empty(&sinfo->ro_bgs))
+               return 0;
+
+       spin_lock(&sinfo->lock);
+       list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
                spin_lock(&block_group->lock);
 
                if (!block_group->ro) {
@@ -8594,26 +8612,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
 
                spin_unlock(&block_group->lock);
        }
-
-       return free_bytes;
-}
-
-/*
- * helper to account the unused space of all the readonly block group in the
- * space_info. takes mirrors into account.
- */
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
-{
-       int i;
-       u64 free_bytes = 0;
-
-       spin_lock(&sinfo->lock);
-
-       for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
-               if (!list_empty(&sinfo->block_groups[i]))
-                       free_bytes += __btrfs_get_ro_block_group_free_space(
-                                               &sinfo->block_groups[i]);
-
        spin_unlock(&sinfo->lock);
 
        return free_bytes;
@@ -8633,6 +8631,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
                    cache->bytes_super - btrfs_block_group_used(&cache->item);
        sinfo->bytes_readonly -= num_bytes;
        cache->ro = 0;
+       list_del_init(&cache->ro_list);
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
 }
@@ -9002,7 +9001,9 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
        INIT_LIST_HEAD(&cache->list);
        INIT_LIST_HEAD(&cache->cluster_list);
        INIT_LIST_HEAD(&cache->bg_list);
+       INIT_LIST_HEAD(&cache->ro_list);
        btrfs_init_free_space_ctl(cache);
+       atomic_set(&cache->trimming, 0);
 
        return cache;
 }
@@ -9195,9 +9196,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
        int ret = 0;
 
        list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
-               list_del_init(&block_group->bg_list);
                if (ret)
-                       continue;
+                       goto next;
 
                spin_lock(&block_group->lock);
                memcpy(&item, &block_group->item, sizeof(item));
@@ -9212,6 +9212,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
                                               key.objectid, key.offset);
                if (ret)
                        btrfs_abort_transaction(trans, extent_root, ret);
+next:
+               list_del_init(&block_group->bg_list);
        }
 }
 
@@ -9304,7 +9306,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 }
 
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root, u64 group_start)
+                            struct btrfs_root *root, u64 group_start,
+                            struct extent_map *em)
 {
        struct btrfs_path *path;
        struct btrfs_block_group_cache *block_group;
@@ -9316,6 +9319,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        int ret;
        int index;
        int factor;
+       struct btrfs_caching_control *caching_ctl = NULL;
+       bool remove_em;
 
        root = root->fs_info->extent_root;
 
@@ -9400,6 +9405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        spin_lock(&root->fs_info->block_group_cache_lock);
        rb_erase(&block_group->cache_node,
                 &root->fs_info->block_group_cache_tree);
+       RB_CLEAR_NODE(&block_group->cache_node);
 
        if (root->fs_info->first_logical_byte == block_group->key.objectid)
                root->fs_info->first_logical_byte = (u64)-1;
@@ -9411,6 +9417,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * are still on the list after taking the semaphore
         */
        list_del_init(&block_group->list);
+       list_del_init(&block_group->ro_list);
        if (list_empty(&block_group->space_info->block_groups[index])) {
                kobj = block_group->space_info->block_group_kobjs[index];
                block_group->space_info->block_group_kobjs[index] = NULL;
@@ -9422,8 +9429,32 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                kobject_put(kobj);
        }
 
+       if (block_group->has_caching_ctl)
+               caching_ctl = get_caching_control(block_group);
        if (block_group->cached == BTRFS_CACHE_STARTED)
                wait_block_group_cache_done(block_group);
+       if (block_group->has_caching_ctl) {
+               down_write(&root->fs_info->commit_root_sem);
+               if (!caching_ctl) {
+                       struct btrfs_caching_control *ctl;
+
+                       list_for_each_entry(ctl,
+                                   &root->fs_info->caching_block_groups, list)
+                               if (ctl->block_group == block_group) {
+                                       caching_ctl = ctl;
+                                       atomic_inc(&caching_ctl->count);
+                                       break;
+                               }
+               }
+               if (caching_ctl)
+                       list_del_init(&caching_ctl->list);
+               up_write(&root->fs_info->commit_root_sem);
+               if (caching_ctl) {
+                       /* Once for the caching bgs list and once for us. */
+                       put_caching_control(caching_ctl);
+                       put_caching_control(caching_ctl);
+               }
+       }
 
        btrfs_remove_free_space_cache(block_group);
 
@@ -9435,6 +9466,71 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
        memcpy(&key, &block_group->key, sizeof(key));
 
+       lock_chunks(root);
+       if (!list_empty(&em->list)) {
+               /* We're in the transaction->pending_chunks list. */
+               free_extent_map(em);
+       }
+       spin_lock(&block_group->lock);
+       block_group->removed = 1;
+       /*
+        * At this point trimming can't start on this block group, because we
+        * removed the block group from fs_info->block_group_cache_tree, so
+        * no one can find it anymore, and even if someone already got this
+        * block group before we removed it from the rbtree, they have already
+        * incremented block_group->trimming - if they didn't, they won't find
+        * any free space entries because we already removed them all when we
+        * called btrfs_remove_free_space_cache().
+        *
+        * And we must not remove the extent map from the fs_info->mapping_tree
+        * to prevent the same logical address range and physical device space
+        * ranges from being reused for a new block group. This is because our
+        * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+        * completely transactionless, so while it is trimming a range the
+        * currently running transaction might finish and a new one start,
+        * allowing for new block groups to be created that can reuse the same
+        * physical device locations unless we take this special care.
+        */
+       remove_em = (atomic_read(&block_group->trimming) == 0);
+       /*
+        * Make sure a trimmer task always sees the em in the pinned_chunks list
+        * if it sees block_group->removed == 1 (needs to lock block_group->lock
+        * before checking block_group->removed).
+        */
+       if (!remove_em) {
+               /*
+                * Our em might be in trans->transaction->pending_chunks which
+                * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
+                * and so is the fs_info->pinned_chunks list.
+                *
+                * So at this point we must be holding the chunk_mutex to avoid
+                * any races with chunk allocation (more specifically at
+                * volumes.c:contains_pending_extent()), to ensure it always
+                * sees the em, either in the pending_chunks list or in the
+                * pinned_chunks list.
+                */
+               list_move_tail(&em->list, &root->fs_info->pinned_chunks);
+       }
+       spin_unlock(&block_group->lock);
+
+       if (remove_em) {
+               struct extent_map_tree *em_tree;
+
+               em_tree = &root->fs_info->mapping_tree.map_tree;
+               write_lock(&em_tree->lock);
+               /*
+                * The em might be in the pending_chunks list, so make sure the
+                * chunk mutex is locked, since remove_extent_mapping() will
+                * delete us from that list.
+                */
+               remove_extent_mapping(em_tree, em);
+               write_unlock(&em_tree->lock);
+               /* once for the tree */
+               free_extent_map(em);
+       }
+
+       unlock_chunks(root);
+
        btrfs_put_block_group(block_group);
        btrfs_put_block_group(block_group);
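
The trimming/removed handshake above has a counterpart on the trim side: a
trimmer checks block_group->removed and bumps block_group->trimming under
block_group->lock, so the removal path either sees trimming > 0 and leaves the
extent map pinned, or the trimmer sees removed == 1 and backs off. A rough
sketch of that counterpart, with hypothetical names (the real logic lives in
the trim/free-space code, which is not part of this diff):

    /* Sketch only; sketch_trim_block_group() is illustrative, not a btrfs
     * function. */
    static int sketch_trim_block_group(struct btrfs_block_group_cache *bg)
    {
            int ret = 0;

            spin_lock(&bg->lock);
            if (bg->removed) {
                    /* Lost the race with btrfs_remove_block_group(). */
                    spin_unlock(&bg->lock);
                    return 0;
            }
            atomic_inc(&bg->trimming);
            spin_unlock(&bg->lock);

            /* ... issue the actual discards for this block group here ... */

            spin_lock(&bg->lock);
            if (atomic_dec_and_test(&bg->trimming) && bg->removed) {
                    /*
                     * Last trimmer and the group is gone: the removal path
                     * left the extent map on fs_info->pinned_chunks for us,
                     * so dropping it (under the chunk mutex) is now this
                     * task's job.
                     */
            }
            spin_unlock(&bg->lock);

            return ret;
    }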
 
@@ -9523,10 +9619,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 */
                start = block_group->key.objectid;
                end = start + block_group->key.offset - 1;
-               clear_extent_bits(&fs_info->freed_extents[0], start, end,
+               ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
                                  EXTENT_DIRTY, GFP_NOFS);
-               clear_extent_bits(&fs_info->freed_extents[1], start, end,
+               if (ret) {
+                       btrfs_set_block_group_rw(root, block_group);
+                       goto end_trans;
+               }
+               ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
                                  EXTENT_DIRTY, GFP_NOFS);
+               if (ret) {
+                       btrfs_set_block_group_rw(root, block_group);
+                       goto end_trans;
+               }
 
                /* Reset pinned so btrfs_put_block_group doesn't complain */
                block_group->pinned = 0;
@@ -9537,6 +9641,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 */
                ret = btrfs_remove_chunk(trans, root,
                                         block_group->key.objectid);
+end_trans:
                btrfs_end_transaction(trans, root);
 next:
                btrfs_put_block_group(block_group);
@@ -9657,12 +9762,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 }
 
 /*
- * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
- * they are used to prevent the some tasks writing data into the page cache
- * by nocow before the subvolume is snapshoted, but flush the data into
- * the disk after the snapshot creation.
+ * btrfs_{start,end}_write_no_snapshoting() are similar to
+ * mnt_{want,drop}_write(): they are used to prevent some tasks from writing
+ * data into the page cache through nocow before the subvolume is snapshoted,
+ * but flush the data to disk after the snapshot creation, or to prevent
+ * operations that, while snapshoting is ongoing, would cause the snapshot to
+ * be inconsistent (writes followed by expanding truncates for example).
  */
-void btrfs_end_nocow_write(struct btrfs_root *root)
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
 {
        percpu_counter_dec(&root->subv_writers->counter);
        /*
@@ -9674,7 +9781,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
                wake_up(&root->subv_writers->wait);
 }
 
-int btrfs_start_nocow_write(struct btrfs_root *root)
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
 {
        if (atomic_read(&root->will_be_snapshoted))
                return 0;
@@ -9685,7 +9792,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
         */
        smp_mb();
        if (atomic_read(&root->will_be_snapshoted)) {
-               btrfs_end_nocow_write(root);
+               btrfs_end_write_no_snapshoting(root);
                return 0;
        }
        return 1;
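
The rename also makes the pairing clearer: writers bump the per-cpu counter,
snapshot creation sets will_be_snapshoted and then waits for the counter to
drain. A minimal sketch of the snapshot-side counterpart, with an illustrative
name (the real waiting is done in the snapshot ioctl path, not in this file):

    /*
     * Sketch: set the flag, issue the barrier that pairs with the smp_mb()
     * in btrfs_start_write_no_snapshoting(), then wait until no writer holds
     * the per-cpu counter.
     */
    static void sketch_begin_snapshot(struct btrfs_root *root)
    {
            atomic_inc(&root->will_be_snapshoted);
            smp_mb__after_atomic();

            wait_event(root->subv_writers->wait,
                       percpu_counter_sum(&root->subv_writers->counter) <= 0);
    }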