Btrfs: introduce btrfs_{start, end}_nocow_write() for each subvolume

author Miao Xie <miaox@cn.fujitsu.com>

Thu, 6 Mar 2014 05:38:19 +0000 (13:38 +0800)

committer Josef Bacik <jbacik@fb.com>

Mon, 10 Mar 2014 19:17:22 +0000 (15:17 -0400)
author Miao Xie <miaox@cn.fujitsu.com>
Thu, 6 Mar 2014 05:38:19 +0000 (13:38 +0800)
committer Josef Bacik <jbacik@fb.com>
Mon, 10 Mar 2014 19:17:22 +0000 (15:17 -0400)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h

index b4d2e95..374bb2f 100644 (file)
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1681,6 +1681,11 @@ struct btrfs_fs_info {
         unsigned int update_uuid_tree_gen:1;
  };
  
+struct btrfs_subvolume_writers {
+       struct percpu_counter   counter;
+       wait_queue_head_t       wait;
+};
+
  /*
   * in ram representation of the tree.  extent_root is used for all allocations
   * and for the extent tree extent_root root.
@@ -1829,6 +1834,8 @@ struct btrfs_root {
          * manipulation with the read-only status via SUBVOL_SETFLAGS
          */
         int send_in_progress;
+       struct btrfs_subvolume_writers *subv_writers;
+       atomic_t will_be_snapshoted;
  };
  
  struct btrfs_ioctl_defrag_range_args {
@@ -3353,6 +3360,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
  int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
                                          struct btrfs_fs_info *fs_info);
  int __get_raid_index(u64 flags);
+
+int btrfs_start_nocow_write(struct btrfs_root *root);
+void btrfs_end_nocow_write(struct btrfs_root *root);
  /* ctree.c */
  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                      int level, int *slot);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c

index f7d84d9..7d09ca4 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1149,6 +1149,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
         }
  }
  
+static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
+{
+       struct btrfs_subvolume_writers *writers;
+       int ret;
+
+       writers = kmalloc(sizeof(*writers), GFP_NOFS);
+       if (!writers)
+               return ERR_PTR(-ENOMEM);
+
+       ret = percpu_counter_init(&writers->counter, 0);
+       if (ret < 0) {
+               kfree(writers);
+               return ERR_PTR(ret);
+       }
+
+       init_waitqueue_head(&writers->wait);
+       return writers;
+}
+
+static void
+btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
+{
+       percpu_counter_destroy(&writers->counter);
+       kfree(writers);
+}
+
  static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
                          u32 stripesize, struct btrfs_root *root,
                          struct btrfs_fs_info *fs_info,
@@ -1205,6 +1231,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
         atomic_set(&root->log_batch, 0);
         atomic_set(&root->orphan_inodes, 0);
         atomic_set(&root->refs, 1);
+       atomic_set(&root->will_be_snapshoted, 0);
         root->log_transid = 0;
         root->log_transid_committed = -1;
         root->last_log_commit = 0;
@@ -1502,6 +1529,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
  int btrfs_init_fs_root(struct btrfs_root *root)
  {
         int ret;
+       struct btrfs_subvolume_writers *writers;
  
         root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
         root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1511,6 +1539,13 @@ int btrfs_init_fs_root(struct btrfs_root *root)
                 goto fail;
         }
  
+       writers = btrfs_alloc_subvolume_writers();
+       if (IS_ERR(writers)) {
+               ret = PTR_ERR(writers);
+               goto fail;
+       }
+       root->subv_writers = writers;
+
         btrfs_init_free_ino_ctl(root);
         mutex_init(&root->fs_commit_mutex);
         spin_lock_init(&root->cache_lock);
@@ -1518,8 +1553,11 @@ int btrfs_init_fs_root(struct btrfs_root *root)
  
         ret = get_anon_bdev(&root->anon_dev);
         if (ret)
-               goto fail;
+               goto free_writers;
         return 0;
+
+free_writers:
+       btrfs_free_subvolume_writers(root->subv_writers);
  fail:
         kfree(root->free_ino_ctl);
         kfree(root->free_ino_pinned);
@@ -3459,6 +3497,8 @@ static void free_fs_root(struct btrfs_root *root)
         root->orphan_block_rsv = NULL;
         if (root->anon_dev)
                 free_anon_bdev(root->anon_dev);
+       if (root->subv_writers)
+               btrfs_free_subvolume_writers(root->subv_writers);
         free_extent_buffer(root->node);
         free_extent_buffer(root->commit_root);
         kfree(root->free_ino_ctl);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c

index 19ea8ad..6b821c6 100644 (file)
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8938,3 +8938,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
         range->len = trimmed;
         return ret;
  }
+
+/*
+ * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
+ * they are used to prevent the some tasks writing data into the page cache
+ * by nocow before the subvolume is snapshoted, but flush the data into
+ * the disk after the snapshot creation.
+ */
+void btrfs_end_nocow_write(struct btrfs_root *root)
+{
+       percpu_counter_dec(&root->subv_writers->counter);
+       /*
+        * Make sure counter is updated before we wake up
+        * waiters.
+        */
+       smp_mb();
+       if (waitqueue_active(&root->subv_writers->wait))
+               wake_up(&root->subv_writers->wait);
+}
+
+int btrfs_start_nocow_write(struct btrfs_root *root)
+{
+       if (unlikely(atomic_read(&root->will_be_snapshoted)))
+               return 0;
+
+       percpu_counter_inc(&root->subv_writers->counter);
+       /*
+        * Make sure counter is updated before we check for snapshot creation.
+        */
+       smp_mb();
+       if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+               btrfs_end_nocow_write(root);
+               return 0;
+       }
+       return 1;
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c

index d88f2dc..006cf9f 100644 (file)
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1410,6 +1410,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
         u64 num_bytes;
         int ret;
  
+       ret = btrfs_start_nocow_write(root);
+       if (!ret)
+               return -ENOSPC;
+
         lockstart = round_down(pos, root->sectorsize);
         lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
  
@@ -1427,11 +1431,13 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
  
         num_bytes = lockend - lockstart + 1;
         ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
-       if (ret <= 0)
+       if (ret <= 0) {
                 ret = 0;
-       else
+               btrfs_end_nocow_write(root);
+       } else {
                 *write_bytes = min_t(size_t, *write_bytes ,
                                      num_bytes - pos + lockstart);
+       }
  
         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
  
@@ -1520,6 +1526,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                         if (!only_release_metadata)
                                 btrfs_free_reserved_data_space(inode,
                                                                reserve_bytes);
+                       else
+                               btrfs_end_nocow_write(root);
                         break;
                 }
  
@@ -1608,6 +1616,9 @@ again:
                 }
  
                 release_bytes = 0;
+               if (only_release_metadata)
+                       btrfs_end_nocow_write(root);
+
                 if (only_release_metadata && copied > 0) {
                         u64 lockstart = round_down(pos, root->sectorsize);
                         u64 lockend = lockstart +
@@ -1634,10 +1645,12 @@ again:
         kfree(pages);
  
         if (release_bytes) {
-               if (only_release_metadata)
+               if (only_release_metadata) {
+                       btrfs_end_nocow_write(root);
                         btrfs_delalloc_release_metadata(inode, release_bytes);
-               else
+               } else {
                         btrfs_delalloc_release_space(inode, release_bytes);
+               }
         }
  
         return num_written ? num_written : ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c

index 1ae45bd..57bc9f3 100644 (file)
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -611,6 +611,23 @@ fail:
         return ret;
  }
  
+static void btrfs_wait_nocow_write(struct btrfs_root *root)
+{
+       s64 writers;
+       DEFINE_WAIT(wait);
+
+       do {
+               prepare_to_wait(&root->subv_writers->wait, &wait,
+                               TASK_UNINTERRUPTIBLE);
+
+               writers = percpu_counter_sum(&root->subv_writers->counter);
+               if (writers)
+                       schedule();
+
+               finish_wait(&root->subv_writers->wait, &wait);
+       } while (writers);
+}
+
  static int create_snapshot(struct btrfs_root *root, struct inode *dir,
                            struct dentry *dentry, char *name, int namelen,
                            u64 *async_transid, bool readonly,
@@ -624,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
         if (!root->ref_cows)
                 return -EINVAL;
  
+       atomic_inc(&root->will_be_snapshoted);
+       smp_mb__after_atomic_inc();
+       btrfs_wait_nocow_write(root);
+
         ret = btrfs_start_delalloc_inodes(root, 0);
         if (ret)
-               return ret;
+               goto out;
  
         btrfs_wait_ordered_extents(root, -1);
  
         pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
-       if (!pending_snapshot)
-               return -ENOMEM;
+       if (!pending_snapshot) {
+               ret = -ENOMEM;
+               goto out;
+       }
  
         btrfs_init_block_rsv(&pending_snapshot->block_rsv,
                              BTRFS_BLOCK_RSV_TEMP);
@@ -649,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
                                         &pending_snapshot->qgroup_reserved,
                                         false);
         if (ret)
-               goto out;
+               goto free;
  
         pending_snapshot->dentry = dentry;
         pending_snapshot->root = root;
@@ -700,8 +723,10 @@ fail:
         btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
                                          &pending_snapshot->block_rsv,
                                          pending_snapshot->qgroup_reserved);
-out:
+free:
         kfree(pending_snapshot);
+out:
+       atomic_dec(&root->will_be_snapshoted);
         return ret;
  }
author	Miao Xie <miaox@cn.fujitsu.com>
	Thu, 6 Mar 2014 05:38:19 +0000 (13:38 +0800)
committer	Josef Bacik <jbacik@fb.com>
	Mon, 10 Mar 2014 19:17:22 +0000 (15:17 -0400)
fs/btrfs/ctree.h		patch \| blob \| history
fs/btrfs/disk-io.c		patch \| blob \| history
fs/btrfs/extent-tree.c		patch \| blob \| history
fs/btrfs/file.c		patch \| blob \| history
fs/btrfs/ioctl.c		patch \| blob \| history