Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
author     Linus Torvalds <torvalds@linux-foundation.org>
           Fri, 12 Dec 2014 19:15:23 +0000 (11:15 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Fri, 12 Dec 2014 19:15:23 +0000 (11:15 -0800)
Pull btrfs update from Chris Mason:
 "From a feature point of view, most of the code here comes from Miao
  Xie and others at Fujitsu to implement scrubbing and replacing devices
  on raid56.  This has been in development for a while, and it's a big
  improvement.

  Filipe and Josef have a great assortment of fixes, many of which solve
  corruption problems either after a crash or in error conditions.  I
  still have a round two from Filipe for next week that solves
  corruptions with discard and block group removal"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (62 commits)
  Btrfs: make get_caching_control unconditionally return the ctl
  Btrfs: fix unprotected deletion from pending_chunks list
  Btrfs: fix fs mapping extent map leak
  Btrfs: fix memory leak after block remove + trimming
  Btrfs: make btrfs_abort_transaction consider existence of new block groups
  Btrfs: fix race between writing free space cache and trimming
  Btrfs: fix race between fs trimming and block group remove/allocation
  Btrfs, replace: enable dev-replace for raid56
  Btrfs: fix freeing used extents after removing empty block group
  Btrfs: fix crash caused by block group removal
  Btrfs: fix invalid block group rbtree access after bg is removed
  Btrfs, raid56: fix use-after-free problem in the final device replace procedure on raid56
  Btrfs, replace: write raid56 parity into the replace target device
  Btrfs, replace: write dirty pages into the replace target device
  Btrfs, raid56: support parity scrub on raid56
  Btrfs, raid56: use a variant to record the operation type
  Btrfs, scrub: repair the common data on RAID5/6 if it is corrupted
  Btrfs, raid56: don't change bbio and raid_map
  Btrfs: remove unnecessary code of stripe_index assignment in __btrfs_map_block
  Btrfs: remove noused bbio_ret in __btrfs_map_block in condition
  ...

32 files changed:
fs/btrfs/check-integrity.c
fs/btrfs/compression.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/dev-replace.c
fs/btrfs/dir-item.c
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.c
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/free-space-cache.h
fs/btrfs/inode-map.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/raid56.c
fs/btrfs/raid56.h
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/super.c
fs/btrfs/sysfs.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/xattr.c
include/uapi/linux/btrfs.h

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index cb7f3fe..d897ef8 100644
@@ -94,6 +94,7 @@
 #include <linux/mutex.h>
 #include <linux/genhd.h>
 #include <linux/blkdev.h>
+#include <linux/vmalloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "hash.h"
@@ -326,9 +327,6 @@ static int btrfsic_handle_extent_data(struct btrfsic_state *state,
 static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
                             struct btrfsic_block_data_ctx *block_ctx_out,
                             int mirror_num);
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
-                                 u32 len, struct block_device *bdev,
-                                 struct btrfsic_block_data_ctx *block_ctx_out);
 static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
 static int btrfsic_read_block(struct btrfsic_state *state,
                              struct btrfsic_block_data_ctx *block_ctx);
@@ -1326,24 +1324,25 @@ static int btrfsic_create_link_to_next_block(
                l = NULL;
                next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
        } else {
-               if (next_block->logical_bytenr != next_bytenr &&
-                   !(!next_block->is_metadata &&
-                     0 == next_block->logical_bytenr)) {
-                       printk(KERN_INFO
-                              "Referenced block @%llu (%s/%llu/%d)"
-                              " found in hash table, %c,"
-                              " bytenr mismatch (!= stored %llu).\n",
-                              next_bytenr, next_block_ctx->dev->name,
-                              next_block_ctx->dev_bytenr, *mirror_nump,
-                              btrfsic_get_block_type(state, next_block),
-                              next_block->logical_bytenr);
-               } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-                       printk(KERN_INFO
-                              "Referenced block @%llu (%s/%llu/%d)"
-                              " found in hash table, %c.\n",
-                              next_bytenr, next_block_ctx->dev->name,
-                              next_block_ctx->dev_bytenr, *mirror_nump,
-                              btrfsic_get_block_type(state, next_block));
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+                       if (next_block->logical_bytenr != next_bytenr &&
+                           !(!next_block->is_metadata &&
+                             0 == next_block->logical_bytenr))
+                               printk(KERN_INFO
+                                      "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+                                      next_bytenr, next_block_ctx->dev->name,
+                                      next_block_ctx->dev_bytenr, *mirror_nump,
+                                      btrfsic_get_block_type(state,
+                                                             next_block),
+                                      next_block->logical_bytenr);
+                       else
+                               printk(KERN_INFO
+                                      "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+                                      next_bytenr, next_block_ctx->dev->name,
+                                      next_block_ctx->dev_bytenr, *mirror_nump,
+                                      btrfsic_get_block_type(state,
+                                                             next_block));
+               }
                next_block->logical_bytenr = next_bytenr;
 
                next_block->mirror_num = *mirror_nump;
@@ -1529,7 +1528,9 @@ static int btrfsic_handle_extent_data(
                                return -1;
                        }
                        if (!block_was_created) {
-                               if (next_block->logical_bytenr != next_bytenr &&
+                               if ((state->print_mask &
+                                    BTRFSIC_PRINT_MASK_VERBOSE) &&
+                                   next_block->logical_bytenr != next_bytenr &&
                                    !(!next_block->is_metadata &&
                                      0 == next_block->logical_bytenr)) {
                                        printk(KERN_INFO
@@ -1607,25 +1608,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
        return ret;
 }
 
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
-                                 u32 len, struct block_device *bdev,
-                                 struct btrfsic_block_data_ctx *block_ctx_out)
-{
-       block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
-       block_ctx_out->dev_bytenr = bytenr;
-       block_ctx_out->start = bytenr;
-       block_ctx_out->len = len;
-       block_ctx_out->datav = NULL;
-       block_ctx_out->pagev = NULL;
-       block_ctx_out->mem_to_free = NULL;
-       if (NULL != block_ctx_out->dev) {
-               return 0;
-       } else {
-               printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
-               return -ENXIO;
-       }
-}
-
 static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
 {
        if (block_ctx->mem_to_free) {
@@ -1901,25 +1883,26 @@ again:
                                                               dev_state,
                                                               dev_bytenr);
                        }
-                       if (block->logical_bytenr != bytenr &&
-                           !(!block->is_metadata &&
-                             block->logical_bytenr == 0))
-                               printk(KERN_INFO
-                                      "Written block @%llu (%s/%llu/%d)"
-                                      " found in hash table, %c,"
-                                      " bytenr mismatch"
-                                      " (!= stored %llu).\n",
-                                      bytenr, dev_state->name, dev_bytenr,
-                                      block->mirror_num,
-                                      btrfsic_get_block_type(state, block),
-                                      block->logical_bytenr);
-                       else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-                               printk(KERN_INFO
-                                      "Written block @%llu (%s/%llu/%d)"
-                                      " found in hash table, %c.\n",
-                                      bytenr, dev_state->name, dev_bytenr,
-                                      block->mirror_num,
-                                      btrfsic_get_block_type(state, block));
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+                               if (block->logical_bytenr != bytenr &&
+                                   !(!block->is_metadata &&
+                                     block->logical_bytenr == 0))
+                                       printk(KERN_INFO
+                                              "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+                                              bytenr, dev_state->name,
+                                              dev_bytenr,
+                                              block->mirror_num,
+                                              btrfsic_get_block_type(state,
+                                                                     block),
+                                              block->logical_bytenr);
+                               else
+                                       printk(KERN_INFO
+                                              "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+                                              bytenr, dev_state->name,
+                                              dev_bytenr, block->mirror_num,
+                                              btrfsic_get_block_type(state,
+                                                                     block));
+                       }
                        block->logical_bytenr = bytenr;
                } else {
                        if (num_pages * PAGE_CACHE_SIZE <
@@ -2002,24 +1985,13 @@ again:
                        }
                }
 
-               if (block->is_superblock)
-                       ret = btrfsic_map_superblock(state, bytenr,
-                                                    processed_len,
-                                                    bdev, &block_ctx);
-               else
-                       ret = btrfsic_map_block(state, bytenr, processed_len,
-                                               &block_ctx, 0);
-               if (ret) {
-                       printk(KERN_INFO
-                              "btrfsic: btrfsic_map_block(root @%llu)"
-                              " failed!\n", bytenr);
-                       goto continue_loop;
-               }
-               block_ctx.datav = mapped_datav;
-               /* the following is required in case of writes to mirrors,
-                * use the same that was used for the lookup */
                block_ctx.dev = dev_state;
                block_ctx.dev_bytenr = dev_bytenr;
+               block_ctx.start = bytenr;
+               block_ctx.len = processed_len;
+               block_ctx.pagev = NULL;
+               block_ctx.mem_to_free = NULL;
+               block_ctx.datav = mapped_datav;
 
                if (is_metadata || state->include_extent_data) {
                        block->never_written = 0;
@@ -2133,10 +2105,6 @@ again:
                        /* this is getting ugly for the
                         * include_extent_data case... */
                        bytenr = 0;     /* unknown */
-                       block_ctx.start = bytenr;
-                       block_ctx.len = processed_len;
-                       block_ctx.mem_to_free = NULL;
-                       block_ctx.pagev = NULL;
                } else {
                        processed_len = state->metablock_size;
                        bytenr = btrfs_stack_header_bytenr(
@@ -2149,22 +2117,15 @@ again:
                                       "Written block @%llu (%s/%llu/?)"
                                       " !found in hash table, M.\n",
                                       bytenr, dev_state->name, dev_bytenr);
-
-                       ret = btrfsic_map_block(state, bytenr, processed_len,
-                                               &block_ctx, 0);
-                       if (ret) {
-                               printk(KERN_INFO
-                                      "btrfsic: btrfsic_map_block(root @%llu)"
-                                      " failed!\n",
-                                      dev_bytenr);
-                               goto continue_loop;
-                       }
                }
-               block_ctx.datav = mapped_datav;
-               /* the following is required in case of writes to mirrors,
-                * use the same that was used for the lookup */
+
                block_ctx.dev = dev_state;
                block_ctx.dev_bytenr = dev_bytenr;
+               block_ctx.start = bytenr;
+               block_ctx.len = processed_len;
+               block_ctx.pagev = NULL;
+               block_ctx.mem_to_free = NULL;
+               block_ctx.datav = mapped_datav;
 
                block = btrfsic_block_alloc();
                if (NULL == block) {
@@ -3130,10 +3091,13 @@ int btrfsic_mount(struct btrfs_root *root,
                       root->sectorsize, PAGE_CACHE_SIZE);
                return -1;
        }
-       state = kzalloc(sizeof(*state), GFP_NOFS);
-       if (NULL == state) {
-               printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
-               return -1;
+       state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+       if (!state) {
+               state = vzalloc(sizeof(*state));
+               if (!state) {
+                       printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n");
+                       return -1;
+               }
        }
 
        if (!btrfsic_is_initialized) {
@@ -3277,5 +3241,8 @@ void btrfsic_unmount(struct btrfs_root *root,
 
        mutex_unlock(&btrfsic_mutex);
 
-       kfree(state);
+       if (is_vmalloc_addr(state))
+               vfree(state);
+       else
+               kfree(state);
 }
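
The check-integrity change above stops insisting on a physically contiguous allocation for the large struct btrfsic_state: it tries kzalloc() without warning on failure, falls back to vzalloc(), and at unmount picks the matching free routine with is_vmalloc_addr(). A minimal sketch of that allocation pattern, with hypothetical helper names rather than the btrfs functions:

        #include <linux/slab.h>
        #include <linux/vmalloc.h>
        #include <linux/mm.h>           /* is_vmalloc_addr() */

        static void *state_alloc(size_t size)
        {
                void *p;

                /*
                 * Try physically contiguous memory first, but do not warn
                 * on failure: a multi-megabyte kmalloc easily fails once
                 * memory is fragmented.
                 */
                p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
                if (!p)
                        p = vzalloc(size);      /* virtually contiguous fallback */
                return p;
        }

        static void state_free(void *p)
        {
                /* Pick the free routine matching whichever allocator won. */
                if (is_vmalloc_addr(p))
                        vfree(p);
                else
                        kfree(p);
        }

Current kernels wrap this same pattern in the kvzalloc()/kvfree() helpers.
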
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index dcd9be3..e9df886 100644
@@ -224,16 +224,19 @@ out:
  * Clear the writeback bits on all of the file
  * pages for a compressed write
  */
-static noinline void end_compressed_writeback(struct inode *inode, u64 start,
-                                             unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode,
+                                             const struct compressed_bio *cb)
 {
-       unsigned long index = start >> PAGE_CACHE_SHIFT;
-       unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+       unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
+       unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
        struct page *pages[16];
        unsigned long nr_pages = end_index - index + 1;
        int i;
        int ret;
 
+       if (cb->errors)
+               mapping_set_error(inode->i_mapping, -EIO);
+
        while (nr_pages > 0) {
                ret = find_get_pages_contig(inode->i_mapping, index,
                                     min_t(unsigned long,
@@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start,
                        continue;
                }
                for (i = 0; i < ret; i++) {
+                       if (cb->errors)
+                               SetPageError(pages[i]);
                        end_page_writeback(pages[i]);
                        page_cache_release(pages[i]);
                }
@@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int err)
        tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
                                         cb->start,
                                         cb->start + cb->len - 1,
-                                        NULL, 1);
+                                        NULL,
+                                        err ? 0 : 1);
        cb->compressed_pages[0]->mapping = NULL;
 
-       end_compressed_writeback(inode, cb->start, cb->len);
+       end_compressed_writeback(inode, cb);
        /* note, our inode could be gone now */
 
        /*
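
The compression.c hunks above make a failed compressed write visible to user space: the bio completion now records the error on the inode's mapping and on each page before clearing writeback, and passes the error on to the writepage end_io hook. A small sketch of that per-page propagation, using a hypothetical helper (not a btrfs function):

        #include <linux/pagemap.h>
        #include <linux/page-flags.h>

        /* Hypothetical helper: finish writeback on one page that failed. */
        static void fail_writeback_page(struct address_space *mapping,
                                        struct page *page)
        {
                /*
                 * Remember the error on the mapping so a later fsync() or
                 * filemap_fdatawait() reports -EIO instead of silently
                 * succeeding.
                 */
                mapping_set_error(mapping, -EIO);
                SetPageError(page);
                end_page_writeback(page);
        }
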
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 150822e..14a72ed 100644
@@ -2929,7 +2929,7 @@ done:
         */
        if (!p->leave_spinning)
                btrfs_set_path_blocking(p);
-       if (ret < 0)
+       if (ret < 0 && !p->skip_release_on_error)
                btrfs_release_path(p);
        return ret;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fe69edd..e6fbbd7 100644
@@ -607,6 +607,7 @@ struct btrfs_path {
        unsigned int leave_spinning:1;
        unsigned int search_commit_root:1;
        unsigned int need_commit_sem:1;
+       unsigned int skip_release_on_error:1;
 };
 
 /*
@@ -1170,6 +1171,7 @@ struct btrfs_space_info {
        struct percpu_counter total_bytes_pinned;
 
        struct list_head list;
+       struct list_head ro_bgs;
 
        struct rw_semaphore groups_sem;
        /* for block groups in our same type */
@@ -1276,6 +1278,8 @@ struct btrfs_block_group_cache {
        unsigned int ro:1;
        unsigned int dirty:1;
        unsigned int iref:1;
+       unsigned int has_caching_ctl:1;
+       unsigned int removed:1;
 
        int disk_cache_state;
 
@@ -1305,6 +1309,11 @@ struct btrfs_block_group_cache {
 
        /* For delayed block group creation or deletion of empty block groups */
        struct list_head bg_list;
+
+       /* For read-only block groups */
+       struct list_head ro_list;
+
+       atomic_t trimming;
 };
 
 /* delayed seq elem */
@@ -1402,6 +1411,11 @@ struct btrfs_fs_info {
         */
        u64 last_trans_log_full_commit;
        unsigned long mount_opt;
+       /*
+        * Track requests for actions that need to be done during transaction
+        * commit (like for some mount options).
+        */
+       unsigned long pending_changes;
        unsigned long compress_type:4;
        int commit_interval;
        /*
@@ -1729,6 +1743,12 @@ struct btrfs_fs_info {
 
        /* For btrfs to record security options */
        struct security_mnt_opts security_opts;
+
+       /*
+        * Chunks that can't be freed yet (under a trim/discard operation)
+        * and will be latter freed. Protected by fs_info->chunk_mutex.
+        */
+       struct list_head pinned_chunks;
 };
 
 struct btrfs_subvolume_writers {
@@ -2093,7 +2113,6 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
 #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR       (1 << 22)
 #define BTRFS_MOUNT_RESCAN_UUID_TREE   (1 << 23)
-#define        BTRFS_MOUNT_CHANGE_INODE_CACHE  (1 << 24)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL  (30)
 #define BTRFS_DEFAULT_MAX_INLINE       (8192)
@@ -2103,6 +2122,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define btrfs_raw_test_opt(o, opt)     ((o) & BTRFS_MOUNT_##opt)
 #define btrfs_test_opt(root, opt)      ((root)->fs_info->mount_opt & \
                                         BTRFS_MOUNT_##opt)
+
 #define btrfs_set_and_info(root, opt, fmt, args...)                    \
 {                                                                      \
        if (!btrfs_test_opt(root, opt))                                 \
@@ -2117,6 +2137,49 @@ struct btrfs_ioctl_defrag_range_args {
        btrfs_clear_opt(root->fs_info->mount_opt, opt);                 \
 }
 
+/*
+ * Requests for changes that need to be done during transaction commit.
+ *
+ * Internal mount options that are used for special handling of the real
+ * mount options (eg. cannot be set during remount and have to be set during
+ * transaction commit)
+ */
+
+#define BTRFS_PENDING_SET_INODE_MAP_CACHE      (0)
+#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE    (1)
+#define BTRFS_PENDING_COMMIT                   (2)
+
+#define btrfs_test_pending(info, opt)  \
+       test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_set_pending(info, opt)   \
+       set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_clear_pending(info, opt) \
+       clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+
+/*
+ * Helpers for setting pending mount option changes.
+ *
+ * Expects corresponding macros
+ * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name
+ */
+#define btrfs_set_pending_and_info(info, opt, fmt, args...)            \
+do {                                                                   \
+       if (!btrfs_raw_test_opt((info)->mount_opt, opt)) {              \
+               btrfs_info((info), fmt, ##args);                        \
+               btrfs_set_pending((info), SET_##opt);                   \
+               btrfs_clear_pending((info), CLEAR_##opt);               \
+       }                                                               \
+} while(0)
+
+#define btrfs_clear_pending_and_info(info, opt, fmt, args...)          \
+do {                                                                   \
+       if (btrfs_raw_test_opt((info)->mount_opt, opt)) {               \
+               btrfs_info((info), fmt, ##args);                        \
+               btrfs_set_pending((info), CLEAR_##opt);                 \
+               btrfs_clear_pending((info), SET_##opt);                 \
+       }                                                               \
+} while(0)
+
 /*
  * Inode flags
  */
@@ -3351,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                           u64 type, u64 chunk_objectid, u64 chunk_offset,
                           u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root, u64 group_start);
+                            struct btrfs_root *root, u64 group_start,
+                            struct extent_map *em);
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root);
@@ -3427,8 +3491,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
-int btrfs_start_nocow_write(struct btrfs_root *root);
-void btrfs_end_nocow_write(struct btrfs_root *root);
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@ -3686,6 +3750,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 int verify_dir_item(struct btrfs_root *root,
                    struct extent_buffer *leaf,
                    struct btrfs_dir_item *dir_item);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+                                                struct btrfs_path *path,
+                                                const char *name,
+                                                int name_len);
 
 /* orphan.c */
 int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3857,6 +3925,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
                                    struct btrfs_trans_handle *trans, int mode,
                                    u64 start, u64 num_bytes, u64 min_size,
                                    loff_t actual_len, u64 *alloc_hint);
+int btrfs_inode_check_errors(struct inode *inode);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
@@ -3901,6 +3970,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
                      struct page **pages, size_t num_pages,
                      loff_t pos, size_t write_bytes,
                      struct extent_state **cached);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -4097,7 +4167,12 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 /* dev-replace.c */
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
-void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
+
+static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+       btrfs_bio_counter_sub(fs_info, 1);
+}
 
 /* reada.c */
 struct reada_control {
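
The pending_changes field and the btrfs_*_pending macros added to ctree.h above encode "apply this mount-option change at the next transaction commit" as bits in an unsigned long, with a SET_ and a CLEAR_ bit per option so a newer request cancels an older, opposite one. A standalone sketch of the idea, with hypothetical names (these are not the btrfs definitions):

        #include <linux/bitops.h>

        #define PENDING_SET_FOO_CACHE    0
        #define PENDING_CLEAR_FOO_CACHE  1
        #define OPT_FOO_CACHE            (1UL << 0)

        static unsigned long pending_changes;

        /*
         * Request "enable foo cache at the next commit"; clearing the
         * opposite bit lets the newest request win.
         */
        static void request_enable_foo_cache(void)
        {
                set_bit(PENDING_SET_FOO_CACHE, &pending_changes);
                clear_bit(PENDING_CLEAR_FOO_CACHE, &pending_changes);
        }

        /* Called at commit time: apply and consume the requests. */
        static void apply_pending_changes(unsigned long *mount_opt)
        {
                if (test_and_clear_bit(PENDING_SET_FOO_CACHE, &pending_changes))
                        *mount_opt |= OPT_FOO_CACHE;
                if (test_and_clear_bit(PENDING_CLEAR_FOO_CACHE, &pending_changes))
                        *mount_opt &= ~OPT_FOO_CACHE;
        }
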
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 6f662b3..ca6a3a3 100644
@@ -316,11 +316,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
        struct btrfs_device *tgt_device = NULL;
        struct btrfs_device *src_device = NULL;
 
-       if (btrfs_fs_incompat(fs_info, RAID56)) {
-               btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
-               return -EOPNOTSUPP;
-       }
-
        switch (args->start.cont_reading_from_srcdev_mode) {
        case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
        case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
@@ -422,9 +417,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
                              &dev_replace->scrub_progress, 0, 1);
 
        ret = btrfs_dev_replace_finishing(root->fs_info, ret);
-       WARN_ON(ret);
+       /* don't warn if EINPROGRESS, someone else might be running scrub */
+       if (ret == -EINPROGRESS) {
+               args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
+               ret = 0;
+       } else {
+               WARN_ON(ret);
+       }
 
-       return 0;
+       return ret;
 
 leave:
        dev_replace->srcdev = NULL;
@@ -542,7 +543,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                        btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 
-               return 0;
+               return scrub_ret;
        }
 
        printk_in_rcu(KERN_INFO
@@ -571,15 +572,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
        list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
        fs_info->fs_devices->rw_devices++;
 
-       /* replace the sysfs entry */
-       btrfs_kobj_rm_device(fs_info, src_device);
-       btrfs_kobj_add_device(fs_info, tgt_device);
-
        btrfs_dev_replace_unlock(dev_replace);
 
        btrfs_rm_dev_replace_blocked(fs_info);
 
-       btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+       btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
 
        btrfs_rm_dev_replace_unblocked(fs_info);
 
@@ -594,6 +591,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
        mutex_unlock(&uuid_mutex);
 
+       /* replace the sysfs entry */
+       btrfs_kobj_rm_device(fs_info, src_device);
+       btrfs_kobj_add_device(fs_info, tgt_device);
+       btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
+
        /* write back the superblocks */
        trans = btrfs_start_transaction(root, 0);
        if (!IS_ERR(trans))
@@ -920,9 +922,9 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
        percpu_counter_inc(&fs_info->bio_counter);
 }
 
-void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
 {
-       percpu_counter_dec(&fs_info->bio_counter);
+       percpu_counter_sub(&fs_info->bio_counter, amount);
 
        if (waitqueue_active(&fs_info->replace_wait))
                wake_up(&fs_info->replace_wait);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index fc8df86..1752625 100644
 #include "hash.h"
 #include "transaction.h"
 
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
-                             struct btrfs_path *path,
-                             const char *name, int name_len);
-
 /*
  * insert a name into a directory, doing overflow properly if there is a hash
  * collision.  data_size indicates how big the item inserted should be.  On
@@ -383,9 +379,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
  * this walks through all the entries in a dir item and finds one
  * for a specific name.
  */
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
-                             struct btrfs_path *path,
-                             const char *name, int name_len)
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+                                                struct btrfs_path *path,
+                                                const char *name, int name_len)
 {
        struct btrfs_dir_item *dir_item;
        unsigned long name_ptr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1bf9f89..3096512 100644
@@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb,
        init_waitqueue_head(&fs_info->transaction_blocked_wait);
        init_waitqueue_head(&fs_info->async_submit_wait);
 
+       INIT_LIST_HEAD(&fs_info->pinned_chunks);
+
        ret = btrfs_alloc_stripe_hash_table(fs_info);
        if (ret) {
                err = ret;
@@ -2830,9 +2832,11 @@ retry_root_backup:
                btrfs_set_opt(fs_info->mount_opt, SSD);
        }
 
-       /* Set the real inode map cache flag */
-       if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE))
-               btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE);
+       /*
+        * Mount does not set all options immediatelly, we can do it now and do
+        * not have to wait for transaction commit
+        */
+       btrfs_apply_pending_changes(fs_info);
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
        if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
@@ -3713,6 +3717,17 @@ void close_ctree(struct btrfs_root *root)
 
        btrfs_free_block_rsv(root, root->orphan_block_rsv);
        root->orphan_block_rsv = NULL;
+
+       lock_chunks(root);
+       while (!list_empty(&fs_info->pinned_chunks)) {
+               struct extent_map *em;
+
+               em = list_first_entry(&fs_info->pinned_chunks,
+                                     struct extent_map, list);
+               list_del_init(&em->list);
+               free_extent_map(em);
+       }
+       unlock_chunks(root);
 }
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3839,12 +3854,12 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
         */
        if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
                printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
-                               sb->root);
+                               btrfs_super_root(sb));
        if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
-               printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
-                               sb->chunk_root);
+               printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
+                               btrfs_super_chunk_root(sb));
        if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
-               printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+               printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
                                btrfs_super_log_root(sb));
 
        if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
@@ -4129,6 +4144,25 @@ again:
        return 0;
 }
 
+static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
+                                      struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_ordered_extent *ordered;
+
+       spin_lock(&fs_info->trans_lock);
+       while (!list_empty(&cur_trans->pending_ordered)) {
+               ordered = list_first_entry(&cur_trans->pending_ordered,
+                                          struct btrfs_ordered_extent,
+                                          trans_list);
+               list_del_init(&ordered->trans_list);
+               spin_unlock(&fs_info->trans_lock);
+
+               btrfs_put_ordered_extent(ordered);
+               spin_lock(&fs_info->trans_lock);
+       }
+       spin_unlock(&fs_info->trans_lock);
+}
+
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
                                   struct btrfs_root *root)
 {
@@ -4140,6 +4174,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
        cur_trans->state = TRANS_STATE_UNBLOCKED;
        wake_up(&root->fs_info->transaction_wait);
 
+       btrfs_free_pending_ordered(cur_trans, root->fs_info);
        btrfs_destroy_delayed_inodes(root);
        btrfs_assert_delayed_root_empty(root);
 
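
btrfs_free_pending_ordered() added to disk-io.c above drains a list by popping one entry at a time and dropping the spinlock around each btrfs_put_ordered_extent() call, so the lock is never held across the reference drop. A generic sketch of that drain loop over a hypothetical item type:

        #include <linux/list.h>
        #include <linux/slab.h>
        #include <linux/spinlock.h>

        struct item {
                struct list_head list;
                /* payload would go here */
        };

        static void drain_items(spinlock_t *lock, struct list_head *head)
        {
                struct item *it;

                spin_lock(lock);
                while (!list_empty(head)) {
                        it = list_first_entry(head, struct item, list);
                        list_del_init(&it->list);
                        /*
                         * Drop the lock around the release: freeing or
                         * dropping the last reference may do more work
                         * than is safe under a spinlock.
                         */
                        spin_unlock(lock);
                        kfree(it);
                        spin_lock(lock);
                }
                spin_unlock(lock);
        }
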
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 47c1ba1..222d6ae 100644
@@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache)
        struct btrfs_caching_control *ctl;
 
        spin_lock(&cache->lock);
-       if (cache->cached != BTRFS_CACHE_STARTED) {
-               spin_unlock(&cache->lock);
-               return NULL;
-       }
-
-       /* We're loading it the fast way, so we don't have a caching_ctl. */
        if (!cache->caching_ctl) {
                spin_unlock(&cache->lock);
                return NULL;
@@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
        spin_unlock(&cache->lock);
 
        if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
+               mutex_lock(&caching_ctl->mutex);
                ret = load_free_space_cache(fs_info, cache);
 
                spin_lock(&cache->lock);
@@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                        cache->caching_ctl = NULL;
                        cache->cached = BTRFS_CACHE_FINISHED;
                        cache->last_byte_to_unpin = (u64)-1;
+                       caching_ctl->progress = (u64)-1;
                } else {
                        if (load_cache_only) {
                                cache->caching_ctl = NULL;
                                cache->cached = BTRFS_CACHE_NO;
                        } else {
                                cache->cached = BTRFS_CACHE_STARTED;
+                               cache->has_caching_ctl = 1;
                        }
                }
                spin_unlock(&cache->lock);
+               mutex_unlock(&caching_ctl->mutex);
+
                wake_up(&caching_ctl->wait);
                if (ret == 1) {
                        put_caching_control(caching_ctl);
@@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                        cache->cached = BTRFS_CACHE_NO;
                } else {
                        cache->cached = BTRFS_CACHE_STARTED;
+                       cache->has_caching_ctl = 1;
                }
                spin_unlock(&cache->lock);
                wake_up(&caching_ctl->wait);
@@ -3162,7 +3162,19 @@ next_block_group(struct btrfs_root *root,
                 struct btrfs_block_group_cache *cache)
 {
        struct rb_node *node;
+
        spin_lock(&root->fs_info->block_group_cache_lock);
+
+       /* If our block group was removed, we need a full search. */
+       if (RB_EMPTY_NODE(&cache->cache_node)) {
+               const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+
+               spin_unlock(&root->fs_info->block_group_cache_lock);
+               btrfs_put_block_group(cache);
+               cache = btrfs_lookup_first_block_group(root->fs_info,
+                                                      next_bytenr);
+               return cache;
+       }
        node = rb_next(&cache->cache_node);
        btrfs_put_block_group(cache);
        if (node) {
@@ -3504,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->chunk_alloc = 0;
        found->flush = 0;
        init_waitqueue_head(&found->wait);
+       INIT_LIST_HEAD(&found->ro_bgs);
 
        ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
                                    info->space_info_kobj, "%s",
@@ -5425,7 +5438,17 @@ static int update_block_group(struct btrfs_root *root,
                        spin_unlock(&cache->space_info->lock);
                } else {
                        old_val -= num_bytes;
+                       btrfs_set_block_group_used(&cache->item, old_val);
+                       cache->pinned += num_bytes;
+                       cache->space_info->bytes_pinned += num_bytes;
+                       cache->space_info->bytes_used -= num_bytes;
+                       cache->space_info->disk_used -= num_bytes * factor;
+                       spin_unlock(&cache->lock);
+                       spin_unlock(&cache->space_info->lock);
 
+                       set_extent_dirty(info->pinned_extents,
+                                        bytenr, bytenr + num_bytes - 1,
+                                        GFP_NOFS | __GFP_NOFAIL);
                        /*
                         * No longer have used bytes in this block group, queue
                         * it for deletion.
@@ -5439,17 +5462,6 @@ static int update_block_group(struct btrfs_root *root,
                                }
                                spin_unlock(&info->unused_bgs_lock);
                        }
-                       btrfs_set_block_group_used(&cache->item, old_val);
-                       cache->pinned += num_bytes;
-                       cache->space_info->bytes_pinned += num_bytes;
-                       cache->space_info->bytes_used -= num_bytes;
-                       cache->space_info->disk_used -= num_bytes * factor;
-                       spin_unlock(&cache->lock);
-                       spin_unlock(&cache->space_info->lock);
-
-                       set_extent_dirty(info->pinned_extents,
-                                        bytenr, bytenr + num_bytes - 1,
-                                        GFP_NOFS | __GFP_NOFAIL);
                }
                btrfs_put_block_group(cache);
                total -= num_bytes;
@@ -8511,6 +8523,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
            min_allocable_bytes <= sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
                cache->ro = 1;
+               list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
                ret = 0;
        }
 out:
@@ -8565,15 +8578,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
 
 /*
  * helper to account the unused space of all the readonly block group in the
- * list. takes mirrors into account.
+ * space_info. takes mirrors into account.
  */
-static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
 {
        struct btrfs_block_group_cache *block_group;
        u64 free_bytes = 0;
        int factor;
 
-       list_for_each_entry(block_group, groups_list, list) {
+       /* It's df, we don't care if it's racey */
+       if (list_empty(&sinfo->ro_bgs))
+               return 0;
+
+       spin_lock(&sinfo->lock);
+       list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
                spin_lock(&block_group->lock);
 
                if (!block_group->ro) {
@@ -8594,26 +8612,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
 
                spin_unlock(&block_group->lock);
        }
-
-       return free_bytes;
-}
-
-/*
- * helper to account the unused space of all the readonly block group in the
- * space_info. takes mirrors into account.
- */
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
-{
-       int i;
-       u64 free_bytes = 0;
-
-       spin_lock(&sinfo->lock);
-
-       for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
-               if (!list_empty(&sinfo->block_groups[i]))
-                       free_bytes += __btrfs_get_ro_block_group_free_space(
-                                               &sinfo->block_groups[i]);
-
        spin_unlock(&sinfo->lock);
 
        return free_bytes;
@@ -8633,6 +8631,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
                    cache->bytes_super - btrfs_block_group_used(&cache->item);
        sinfo->bytes_readonly -= num_bytes;
        cache->ro = 0;
+       list_del_init(&cache->ro_list);
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
 }
@@ -9002,7 +9001,9 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
        INIT_LIST_HEAD(&cache->list);
        INIT_LIST_HEAD(&cache->cluster_list);
        INIT_LIST_HEAD(&cache->bg_list);
+       INIT_LIST_HEAD(&cache->ro_list);
        btrfs_init_free_space_ctl(cache);
+       atomic_set(&cache->trimming, 0);
 
        return cache;
 }
@@ -9195,9 +9196,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
        int ret = 0;
 
        list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
-               list_del_init(&block_group->bg_list);
                if (ret)
-                       continue;
+                       goto next;
 
                spin_lock(&block_group->lock);
                memcpy(&item, &block_group->item, sizeof(item));
@@ -9212,6 +9212,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
                                               key.objectid, key.offset);
                if (ret)
                        btrfs_abort_transaction(trans, extent_root, ret);
+next:
+               list_del_init(&block_group->bg_list);
        }
 }
 
@@ -9304,7 +9306,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 }
 
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root, u64 group_start)
+                            struct btrfs_root *root, u64 group_start,
+                            struct extent_map *em)
 {
        struct btrfs_path *path;
        struct btrfs_block_group_cache *block_group;
@@ -9316,6 +9319,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        int ret;
        int index;
        int factor;
+       struct btrfs_caching_control *caching_ctl = NULL;
+       bool remove_em;
 
        root = root->fs_info->extent_root;
 
@@ -9400,6 +9405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        spin_lock(&root->fs_info->block_group_cache_lock);
        rb_erase(&block_group->cache_node,
                 &root->fs_info->block_group_cache_tree);
+       RB_CLEAR_NODE(&block_group->cache_node);
 
        if (root->fs_info->first_logical_byte == block_group->key.objectid)
                root->fs_info->first_logical_byte = (u64)-1;
@@ -9411,6 +9417,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * are still on the list after taking the semaphore
         */
        list_del_init(&block_group->list);
+       list_del_init(&block_group->ro_list);
        if (list_empty(&block_group->space_info->block_groups[index])) {
                kobj = block_group->space_info->block_group_kobjs[index];
                block_group->space_info->block_group_kobjs[index] = NULL;
@@ -9422,8 +9429,32 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                kobject_put(kobj);
        }
 
+       if (block_group->has_caching_ctl)
+               caching_ctl = get_caching_control(block_group);
        if (block_group->cached == BTRFS_CACHE_STARTED)
                wait_block_group_cache_done(block_group);
+       if (block_group->has_caching_ctl) {
+               down_write(&root->fs_info->commit_root_sem);
+               if (!caching_ctl) {
+                       struct btrfs_caching_control *ctl;
+
+                       list_for_each_entry(ctl,
+                                   &root->fs_info->caching_block_groups, list)
+                               if (ctl->block_group == block_group) {
+                                       caching_ctl = ctl;
+                                       atomic_inc(&caching_ctl->count);
+                                       break;
+                               }
+               }
+               if (caching_ctl)
+                       list_del_init(&caching_ctl->list);
+               up_write(&root->fs_info->commit_root_sem);
+               if (caching_ctl) {
+                       /* Once for the caching bgs list and once for us. */
+                       put_caching_control(caching_ctl);
+                       put_caching_control(caching_ctl);
+               }
+       }
 
        btrfs_remove_free_space_cache(block_group);
 
@@ -9435,6 +9466,71 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
        memcpy(&key, &block_group->key, sizeof(key));
 
+       lock_chunks(root);
+       if (!list_empty(&em->list)) {
+               /* We're in the transaction->pending_chunks list. */
+               free_extent_map(em);
+       }
+       spin_lock(&block_group->lock);
+       block_group->removed = 1;
+       /*
+        * At this point trimming can't start on this block group, because we
+        * removed the block group from the tree fs_info->block_group_cache_tree
+        * so no one can't find it anymore and even if someone already got this
+        * block group before we removed it from the rbtree, they have already
+        * incremented block_group->trimming - if they didn't, they won't find
+        * any free space entries because we already removed them all when we
+        * called btrfs_remove_free_space_cache().
+        *
+        * And we must not remove the extent map from the fs_info->mapping_tree
+        * to prevent the same logical address range and physical device space
+        * ranges from being reused for a new block group. This is because our
+        * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+        * completely transactionless, so while it is trimming a range the
+        * currently running transaction might finish and a new one start,
+        * allowing for new block groups to be created that can reuse the same
+        * physical device locations unless we take this special care.
+        */
+       remove_em = (atomic_read(&block_group->trimming) == 0);
+       /*
+        * Make sure a trimmer task always sees the em in the pinned_chunks list
+        * if it sees block_group->removed == 1 (needs to lock block_group->lock
+        * before checking block_group->removed).
+        */
+       if (!remove_em) {
+               /*
+                * Our em might be in trans->transaction->pending_chunks which
+                * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
+                * and so is the fs_info->pinned_chunks list.
+                *
+                * So at this point we must be holding the chunk_mutex to avoid
+                * any races with chunk allocation (more specifically at
+                * volumes.c:contains_pending_extent()), to ensure it always
+                * sees the em, either in the pending_chunks list or in the
+                * pinned_chunks list.
+                */
+               list_move_tail(&em->list, &root->fs_info->pinned_chunks);
+       }
+       spin_unlock(&block_group->lock);
+
+       if (remove_em) {
+               struct extent_map_tree *em_tree;
+
+               em_tree = &root->fs_info->mapping_tree.map_tree;
+               write_lock(&em_tree->lock);
+               /*
+                * The em might be in the pending_chunks list, so make sure the
+                * chunk mutex is locked, since remove_extent_mapping() will
+                * delete us from that list.
+                */
+               remove_extent_mapping(em_tree, em);
+               write_unlock(&em_tree->lock);
+               /* once for the tree */
+               free_extent_map(em);
+       }
+
+       unlock_chunks(root);
+
        btrfs_put_block_group(block_group);
        btrfs_put_block_group(block_group);
 
@@ -9523,10 +9619,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 */
                start = block_group->key.objectid;
                end = start + block_group->key.offset - 1;
-               clear_extent_bits(&fs_info->freed_extents[0], start, end,
+               ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
                                  EXTENT_DIRTY, GFP_NOFS);
-               clear_extent_bits(&fs_info->freed_extents[1], start, end,
+               if (ret) {
+                       btrfs_set_block_group_rw(root, block_group);
+                       goto end_trans;
+               }
+               ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
                                  EXTENT_DIRTY, GFP_NOFS);
+               if (ret) {
+                       btrfs_set_block_group_rw(root, block_group);
+                       goto end_trans;
+               }
 
                /* Reset pinned so btrfs_put_block_group doesn't complain */
                block_group->pinned = 0;
@@ -9537,6 +9641,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 */
                ret = btrfs_remove_chunk(trans, root,
                                         block_group->key.objectid);
+end_trans:
                btrfs_end_transaction(trans, root);
 next:
                btrfs_put_block_group(block_group);
@@ -9657,12 +9762,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 }
 
 /*
- * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
- * they are used to prevent the some tasks writing data into the page cache
- * by nocow before the subvolume is snapshoted, but flush the data into
- * the disk after the snapshot creation.
+ * btrfs_{start,end}_write_no_snapshoting() are similar to
+ * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
+ * data into the page cache through nocow before the subvolume is snapshoted,
+ * but flush the data into disk after the snapshot creation, or to prevent
+ * operations while snapshoting is ongoing and that cause the snapshot to be
+ * inconsistent (writes followed by expanding truncates for example).
  */
-void btrfs_end_nocow_write(struct btrfs_root *root)
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
 {
        percpu_counter_dec(&root->subv_writers->counter);
        /*
@@ -9674,7 +9781,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
                wake_up(&root->subv_writers->wait);
 }
 
-int btrfs_start_nocow_write(struct btrfs_root *root)
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
 {
        if (atomic_read(&root->will_be_snapshoted))
                return 0;
@@ -9685,7 +9792,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
         */
        smp_mb();
        if (atomic_read(&root->will_be_snapshoted)) {
-               btrfs_end_nocow_write(root);
+               btrfs_end_write_no_snapshoting(root);
                return 0;
        }
        return 1;
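
btrfs_start/end_write_no_snapshoting() above (the renamed nocow-write gate) implement a lightweight reader/blocker gate: a writer bumps a per-cpu counter and re-checks the "snapshot pending" flag after a full barrier, while the snapshot side sets the flag and waits for the counter to drain (that waiting side is not part of this hunk). A generic sketch of the writer half, with hypothetical names:

        #include <linux/percpu_counter.h>
        #include <linux/atomic.h>
        #include <linux/wait.h>

        struct write_gate {
                struct percpu_counter   writers;  /* in-flight writers            */
                atomic_t                blocked;  /* a blocker (snapshot) pending */
                wait_queue_head_t       wait;
        };

        /* Returns 1 if the caller may proceed with the write, 0 to back off. */
        static int gate_enter(struct write_gate *g)
        {
                if (atomic_read(&g->blocked))
                        return 0;
                percpu_counter_inc(&g->writers);
                /*
                 * Pairs with the blocker: either it observes our increment
                 * and waits for us, or we observe its flag and undo the
                 * increment before doing any write.
                 */
                smp_mb();
                if (atomic_read(&g->blocked)) {
                        percpu_counter_dec(&g->writers);
                        if (waitqueue_active(&g->wait))
                                wake_up(&g->wait);
                        return 0;
                }
                return 1;
        }

        static void gate_exit(struct write_gate *g)
        {
                percpu_counter_dec(&g->writers);
                if (waitqueue_active(&g->wait))
                        wake_up(&g->wait);
        }
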
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bf3f424..4ebabd2 100644
@@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                clear = 1;
 again:
        if (!prealloc && (mask & __GFP_WAIT)) {
+               /*
+                * Don't care for allocation failure here because we might end
+                * up not needing the pre-allocated extent state at all, which
+                * is the case if we only have in the tree extent states that
+                * cover our input range and don't cover too any other range.
+                * If we end up needing a new extent state we allocate it later.
+                */
                prealloc = alloc_extent_state(mask);
-               if (!prealloc)
-                       return -ENOMEM;
        }
 
        spin_lock(&tree->lock);
@@ -796,17 +801,25 @@ static void set_state_bits(struct extent_io_tree *tree,
        state->state |= bits_to_set;
 }
 
-static void cache_state(struct extent_state *state,
-                       struct extent_state **cached_ptr)
+static void cache_state_if_flags(struct extent_state *state,
+                                struct extent_state **cached_ptr,
+                                const u64 flags)
 {
        if (cached_ptr && !(*cached_ptr)) {
-               if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
+               if (!flags || (state->state & flags)) {
                        *cached_ptr = state;
                        atomic_inc(&state->refs);
                }
        }
 }
 
+static void cache_state(struct extent_state *state,
+                       struct extent_state **cached_ptr)
+{
+       return cache_state_if_flags(state, cached_ptr,
+                                   EXTENT_IOBITS | EXTENT_BOUNDARY);
+}
+
 /*
  * set some bits on a range in the tree.  This may require allocations or
  * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -1058,13 +1071,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        int err = 0;
        u64 last_start;
        u64 last_end;
+       bool first_iteration = true;
 
        btrfs_debug_check_extent_io_range(tree, start, end);
 
 again:
        if (!prealloc && (mask & __GFP_WAIT)) {
+               /*
+                * Best effort, don't worry if extent state allocation fails
+                * here for the first iteration. We might have a cached state
+                * that matches exactly the target range, in which case no
+                * extent state allocations are needed. We'll only know this
+                * after locking the tree.
+                */
                prealloc = alloc_extent_state(mask);
-               if (!prealloc)
+               if (!prealloc && !first_iteration)
                        return -ENOMEM;
        }
 
@@ -1234,6 +1255,7 @@ search_again:
        spin_unlock(&tree->lock);
        if (mask & __GFP_WAIT)
                cond_resched();
+       first_iteration = false;
        goto again;
 }
 
@@ -1482,7 +1504,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
        state = find_first_extent_bit_state(tree, start, bits);
 got_it:
        if (state) {
-               cache_state(state, cached_state);
+               cache_state_if_flags(state, cached_state, 0);
                *start_ret = state->start;
                *end_ret = state->end;
                ret = 0;
@@ -1746,6 +1768,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
        if (page_ops == 0)
                return 0;
 
+       if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
+               mapping_set_error(inode->i_mapping, -EIO);
+
        while (nr_pages > 0) {
                ret = find_get_pages_contig(inode->i_mapping, index,
                                     min_t(unsigned long,
@@ -1763,6 +1788,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
                                clear_page_dirty_for_io(pages[i]);
                        if (page_ops & PAGE_SET_WRITEBACK)
                                set_page_writeback(pages[i]);
+                       if (page_ops & PAGE_SET_ERROR)
+                               SetPageError(pages[i]);
                        if (page_ops & PAGE_END_WRITEBACK)
                                end_page_writeback(pages[i]);
                        if (page_ops & PAGE_UNLOCK)
index 6d4b938..ece9ce8 100644 (file)
@@ -49,6 +49,7 @@
 #define PAGE_SET_WRITEBACK     (1 << 2)
 #define PAGE_END_WRITEBACK     (1 << 3)
 #define PAGE_SET_PRIVATE2      (1 << 4)
+#define PAGE_SET_ERROR         (1 << 5)
 
 /*
  * page->private values.  Every page that is controlled by the extent
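
PAGE_SET_ERROR extends the page_ops bitmask walked by extent_clear_unlock_delalloc(); the failed-compression paths above OR it into the other PAGE_* bits so the pages get PageError and the mapping records -EIO. Below is a small standalone model of this kind of bitmask dispatch, not the kernel helper itself; the page struct and the values of the two lowest bits are assumed for the sketch.

#include <stdio.h>

#define PAGE_UNLOCK        (1 << 0)   /* low two bit values assumed */
#define PAGE_CLEAR_DIRTY   (1 << 1)
#define PAGE_SET_WRITEBACK (1 << 2)
#define PAGE_END_WRITEBACK (1 << 3)
#define PAGE_SET_PRIVATE2  (1 << 4)
#define PAGE_SET_ERROR     (1 << 5)

struct page_model { int dirty, writeback, error, locked; };

/* Apply the requested operations to one page, in the same order the
 * real helper applies them. */
static void apply_page_ops(struct page_model *p, unsigned long page_ops)
{
        if (page_ops & PAGE_CLEAR_DIRTY)
                p->dirty = 0;
        if (page_ops & PAGE_SET_WRITEBACK)
                p->writeback = 1;
        if (page_ops & PAGE_SET_ERROR)
                p->error = 1;
        if (page_ops & PAGE_END_WRITEBACK)
                p->writeback = 0;
        if (page_ops & PAGE_UNLOCK)
                p->locked = 0;
}

int main(void)
{
        struct page_model p = { .dirty = 1, .locked = 1 };

        /* What the failed inline-compression path requests. */
        apply_page_ops(&p, PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                           PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
                           PAGE_SET_ERROR);
        printf("dirty=%d writeback=%d error=%d locked=%d\n",
               p.dirty, p.writeback, p.error, p.locked);
        return 0;
}
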
index 225302b..6a98bdd 100644 (file)
@@ -287,8 +287,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
        if (!em)
                goto out;
 
-       if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
-               list_move(&em->list, &tree->modified_extents);
        em->generation = gen;
        clear_bit(EXTENT_FLAG_PINNED, &em->flags);
        em->mod_start = em->start;
index a18ceab..e409025 100644 (file)
@@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
        u64 num_bytes;
        int ret;
 
-       ret = btrfs_start_nocow_write(root);
+       ret = btrfs_start_write_no_snapshoting(root);
        if (!ret)
                return -ENOSPC;
 
@@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
        ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
        if (ret <= 0) {
                ret = 0;
-               btrfs_end_nocow_write(root);
+               btrfs_end_write_no_snapshoting(root);
        } else {
                *write_bytes = min_t(size_t, *write_bytes ,
                                     num_bytes - pos + lockstart);
@@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                                btrfs_free_reserved_data_space(inode,
                                                               reserve_bytes);
                        else
-                               btrfs_end_nocow_write(root);
+                               btrfs_end_write_no_snapshoting(root);
                        break;
                }
 
@@ -1632,7 +1632,7 @@ again:
 
                release_bytes = 0;
                if (only_release_metadata)
-                       btrfs_end_nocow_write(root);
+                       btrfs_end_write_no_snapshoting(root);
 
                if (only_release_metadata && copied > 0) {
                        u64 lockstart = round_down(pos, root->sectorsize);
@@ -1661,7 +1661,7 @@ again:
 
        if (release_bytes) {
                if (only_release_metadata) {
-                       btrfs_end_nocow_write(root);
+                       btrfs_end_write_no_snapshoting(root);
                        btrfs_delalloc_release_metadata(inode, release_bytes);
                } else {
                        btrfs_delalloc_release_space(inode, release_bytes);
@@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
                                    loff_t pos)
 {
        struct file *file = iocb->ki_filp;
+       struct inode *inode = file_inode(file);
        ssize_t written;
        ssize_t written_buffered;
        loff_t endbyte;
@@ -1692,8 +1693,15 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
                err = written_buffered;
                goto out;
        }
+       /*
+        * Ensure all data is persisted. We want the next direct IO read to be
+        * able to read what was just written.
+        */
        endbyte = pos + written_buffered - 1;
-       err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+       err = btrfs_fdatawrite_range(inode, pos, endbyte);
+       if (err)
+               goto out;
+       err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
        if (err)
                goto out;
        written += written_buffered;
@@ -1854,10 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
        int ret;
 
        atomic_inc(&BTRFS_I(inode)->sync_writers);
-       ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
-       if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-                            &BTRFS_I(inode)->runtime_flags))
-               ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+       ret = btrfs_fdatawrite_range(inode, start, end);
        atomic_dec(&BTRFS_I(inode)->sync_writers);
 
        return ret;
@@ -2810,3 +2815,29 @@ int btrfs_auto_defrag_init(void)
 
        return 0;
 }
+
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
+{
+       int ret;
+
+       /*
+        * So with compression we will find and lock a dirty page and clear the
+        * first one as dirty, setup an async extent, and immediately return
+        * with the entire range locked but with nobody actually marked with
+        * writeback.  So we can't just filemap_write_and_wait_range() and
+        * expect it to work since it will just kick off a thread to do the
+        * actual work.  So we need to call filemap_fdatawrite_range _again_
+        * since it will wait on the page lock, which won't be unlocked until
+        * after the pages have been marked as writeback and so we're good to go
+        * from there.  We have to do this otherwise we'll miss the ordered
+        * extents and that results in badness.  Please Josef, do not think you
+        * know better and pull this out at some point in the future, it is
+        * right and you are wrong.
+        */
+       ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+       if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+                            &BTRFS_I(inode)->runtime_flags))
+               ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+
+       return ret;
+}
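
btrfs_fdatawrite_range() above centralizes the write-twice pattern that compression requires, and __btrfs_direct_write() and btrfs_wait_ordered_range() now call it instead of open-coding filemap_fdatawrite_range(). The sketch below is a userspace model of the calling pattern only, with stub functions standing in for the filemap helpers; it is not the kernel API.

#include <stdbool.h>

/* Stand-in for the BTRFS_INODE_HAS_ASYNC_EXTENT runtime flag. */
static bool inode_has_async_extent = true;

static int filemap_fdatawrite_range_stub(void) { return 0; }
static int filemap_fdatawait_range_stub(void)  { return 0; }

static int btrfs_fdatawrite_range_model(void)
{
        int ret = filemap_fdatawrite_range_stub();

        /*
         * With compression the first pass may only have set up async
         * extents: the pages are locked but not yet marked writeback,
         * so waiting now would miss the ordered extents. The second
         * write blocks on the page locks and therefore returns only
         * once writeback has really been started.
         */
        if (!ret && inode_has_async_extent)
                ret = filemap_fdatawrite_range_stub();
        return ret;
}

int main(void)
{
        int ret = btrfs_fdatawrite_range_model();

        /* Callers that need the data on disk wait separately, as
         * __btrfs_direct_write() now does. */
        if (!ret)
                ret = filemap_fdatawait_range_stub();
        return ret;
}
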
index 3384819..030847b 100644 (file)
 #include "disk-io.h"
 #include "extent_io.h"
 #include "inode-map.h"
+#include "volumes.h"
 
 #define BITS_PER_BITMAP                (PAGE_CACHE_SIZE * 8)
 #define MAX_CACHE_BYTES_PER_GIG        (32 * 1024)
 
+struct btrfs_trim_range {
+       u64 start;
+       u64 bytes;
+       struct list_head list;
+};
+
 static int link_free_space(struct btrfs_free_space_ctl *ctl,
                           struct btrfs_free_space *info);
 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
@@ -881,6 +888,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
        int ret;
        struct btrfs_free_cluster *cluster = NULL;
        struct rb_node *node = rb_first(&ctl->free_space_offset);
+       struct btrfs_trim_range *trim_entry;
 
        /* Get the cluster for this block_group if it exists */
        if (block_group && !list_empty(&block_group->cluster_list)) {
@@ -916,6 +924,21 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
                        cluster = NULL;
                }
        }
+
+       /*
+        * Make sure we don't miss any range that was removed from our rbtree
+        * because trimming is running. Otherwise after a umount+mount (or crash
+        * after committing the transaction) we would leak free space and get
+        * an inconsistent free space cache report from fsck.
+        */
+       list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
+               ret = io_ctl_add_entry(io_ctl, trim_entry->start,
+                                      trim_entry->bytes, NULL);
+               if (ret)
+                       goto fail;
+               *entries += 1;
+       }
+
        return 0;
 fail:
        return -ENOSPC;
@@ -1135,12 +1158,15 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
        io_ctl_set_generation(&io_ctl, trans->transid);
 
+       mutex_lock(&ctl->cache_writeout_mutex);
        /* Write out the extent entries in the free space cache */
        ret = write_cache_extent_entries(&io_ctl, ctl,
                                         block_group, &entries, &bitmaps,
                                         &bitmap_list);
-       if (ret)
+       if (ret) {
+               mutex_unlock(&ctl->cache_writeout_mutex);
                goto out_nospc;
+       }
 
        /*
         * Some spaces that are freed in the current transaction are pinned,
@@ -1148,11 +1174,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
         * committed, we shouldn't lose them.
         */
        ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
-       if (ret)
+       if (ret) {
+               mutex_unlock(&ctl->cache_writeout_mutex);
                goto out_nospc;
+       }
 
-       /* At last, we write out all the bitmaps. */
+       /*
+        * Finally, we write out all the bitmaps and keep cache_writeout_mutex
+        * locked while doing it because a concurrent trim can be manipulating
+        * or freeing the bitmap.
+        */
        ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+       mutex_unlock(&ctl->cache_writeout_mutex);
        if (ret)
                goto out_nospc;
 
@@ -2295,6 +2328,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
        ctl->start = block_group->key.objectid;
        ctl->private = block_group;
        ctl->op = &free_space_op;
+       INIT_LIST_HEAD(&ctl->trimming_ranges);
+       mutex_init(&ctl->cache_writeout_mutex);
 
        /*
         * we only want to have 32k of ram per block group for keeping
@@ -2911,10 +2946,12 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
 
 static int do_trimming(struct btrfs_block_group_cache *block_group,
                       u64 *total_trimmed, u64 start, u64 bytes,
-                      u64 reserved_start, u64 reserved_bytes)
+                      u64 reserved_start, u64 reserved_bytes,
+                      struct btrfs_trim_range *trim_entry)
 {
        struct btrfs_space_info *space_info = block_group->space_info;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
+       struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        int ret;
        int update = 0;
        u64 trimmed = 0;
@@ -2934,7 +2971,10 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
        if (!ret)
                *total_trimmed += trimmed;
 
+       mutex_lock(&ctl->cache_writeout_mutex);
        btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+       list_del(&trim_entry->list);
+       mutex_unlock(&ctl->cache_writeout_mutex);
 
        if (update) {
                spin_lock(&space_info->lock);
@@ -2962,16 +3002,21 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
        u64 bytes;
 
        while (start < end) {
+               struct btrfs_trim_range trim_entry;
+
+               mutex_lock(&ctl->cache_writeout_mutex);
                spin_lock(&ctl->tree_lock);
 
                if (ctl->free_space < minlen) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        break;
                }
 
                entry = tree_search_offset(ctl, start, 0, 1);
                if (!entry) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        break;
                }
 
@@ -2980,6 +3025,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
                        node = rb_next(&entry->offset_index);
                        if (!node) {
                                spin_unlock(&ctl->tree_lock);
+                               mutex_unlock(&ctl->cache_writeout_mutex);
                                goto out;
                        }
                        entry = rb_entry(node, struct btrfs_free_space,
@@ -2988,6 +3034,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
 
                if (entry->offset >= end) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        break;
                }
 
@@ -2997,6 +3044,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
                bytes = min(extent_start + extent_bytes, end) - start;
                if (bytes < minlen) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        goto next;
                }
 
@@ -3004,9 +3052,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
                kmem_cache_free(btrfs_free_space_cachep, entry);
 
                spin_unlock(&ctl->tree_lock);
+               trim_entry.start = extent_start;
+               trim_entry.bytes = extent_bytes;
+               list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+               mutex_unlock(&ctl->cache_writeout_mutex);
 
                ret = do_trimming(block_group, total_trimmed, start, bytes,
-                                 extent_start, extent_bytes);
+                                 extent_start, extent_bytes, &trim_entry);
                if (ret)
                        break;
 next:
@@ -3035,17 +3087,21 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
 
        while (offset < end) {
                bool next_bitmap = false;
+               struct btrfs_trim_range trim_entry;
 
+               mutex_lock(&ctl->cache_writeout_mutex);
                spin_lock(&ctl->tree_lock);
 
                if (ctl->free_space < minlen) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        break;
                }
 
                entry = tree_search_offset(ctl, offset, 1, 0);
                if (!entry) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        next_bitmap = true;
                        goto next;
                }
@@ -3054,6 +3110,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
                ret2 = search_bitmap(ctl, entry, &start, &bytes);
                if (ret2 || start >= end) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        next_bitmap = true;
                        goto next;
                }
@@ -3061,6 +3118,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
                bytes = min(bytes, end - start);
                if (bytes < minlen) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        goto next;
                }
 
@@ -3069,9 +3127,13 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
                        free_bitmap(ctl, entry);
 
                spin_unlock(&ctl->tree_lock);
+               trim_entry.start = start;
+               trim_entry.bytes = bytes;
+               list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+               mutex_unlock(&ctl->cache_writeout_mutex);
 
                ret = do_trimming(block_group, total_trimmed, start, bytes,
-                                 start, bytes);
+                                 start, bytes, &trim_entry);
                if (ret)
                        break;
 next:
@@ -3101,11 +3163,52 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 
        *trimmed = 0;
 
+       spin_lock(&block_group->lock);
+       if (block_group->removed) {
+               spin_unlock(&block_group->lock);
+               return 0;
+       }
+       atomic_inc(&block_group->trimming);
+       spin_unlock(&block_group->lock);
+
        ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
        if (ret)
-               return ret;
+               goto out;
 
        ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+out:
+       spin_lock(&block_group->lock);
+       if (atomic_dec_and_test(&block_group->trimming) &&
+           block_group->removed) {
+               struct extent_map_tree *em_tree;
+               struct extent_map *em;
+
+               spin_unlock(&block_group->lock);
+
+               em_tree = &block_group->fs_info->mapping_tree.map_tree;
+               write_lock(&em_tree->lock);
+               em = lookup_extent_mapping(em_tree, block_group->key.objectid,
+                                          1);
+               BUG_ON(!em); /* logic error, can't happen */
+               remove_extent_mapping(em_tree, em);
+               write_unlock(&em_tree->lock);
+
+               lock_chunks(block_group->fs_info->chunk_root);
+               list_del_init(&em->list);
+               unlock_chunks(block_group->fs_info->chunk_root);
+
+               /* once for us and once for the tree */
+               free_extent_map(em);
+               free_extent_map(em);
+
+               /*
+                * We've left one free space entry and other tasks trimming
+                * this block group have left one entry each. Free them all.
+                */
+               __btrfs_remove_free_space_cache(block_group->free_space_ctl);
+       } else {
+               spin_unlock(&block_group->lock);
+       }
 
        return ret;
 }
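
The trim paths above follow a small protocol: take cache_writeout_mutex, unlink the extent from the free-space tree, publish a btrfs_trim_range entry on ctl->trimming_ranges, drop the mutex, issue the (slow) discard, and only afterwards re-add the space and unlink the entry under the mutex again; write_cache_extent_entries() walks the same list under the same mutex, so an in-flight trim range is never missing from the on-disk cache. Below is a compact pthread model of that handshake, with invented names and a simplified singly linked list; it is only a sketch of the locking protocol.

#include <pthread.h>
#include <stdio.h>

struct trim_range {
        unsigned long long start, bytes;
        struct trim_range *next;
};

static pthread_mutex_t cache_writeout_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct trim_range *trimming_ranges;     /* list head */

/* Trimmer side, step 1: publish the range before discarding it. */
static void trim_publish(struct trim_range *entry)
{
        pthread_mutex_lock(&cache_writeout_mutex);
        /* ...the extent was just unlinked from the free space tree... */
        entry->next = trimming_ranges;
        trimming_ranges = entry;
        pthread_mutex_unlock(&cache_writeout_mutex);
        /* the discard itself runs without the mutex held */
}

/* Trimmer side, step 2: return the space and retire the entry. */
static void trim_complete(struct trim_range *entry)
{
        pthread_mutex_lock(&cache_writeout_mutex);
        /* ...the space goes back into the free space tree here... */
        trimming_ranges = entry->next;  /* list_del(); head-only in this demo */
        pthread_mutex_unlock(&cache_writeout_mutex);
}

/* Cache writer side: ranges being trimmed are still written out. */
static void write_cache_entries(void)
{
        struct trim_range *t;

        pthread_mutex_lock(&cache_writeout_mutex);
        for (t = trimming_ranges; t; t = t->next)
                printf("extra cache entry: start=%llu bytes=%llu\n",
                       t->start, t->bytes);
        pthread_mutex_unlock(&cache_writeout_mutex);
}

int main(void)
{
        struct trim_range r = { .start = 4096, .bytes = 65536 };

        trim_publish(&r);
        write_cache_entries();  /* still sees the in-flight range */
        trim_complete(&r);
        write_cache_entries();  /* nothing left on the list */
        return 0;
}
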
index 0cf4977..88b2238 100644 (file)
@@ -38,6 +38,8 @@ struct btrfs_free_space_ctl {
        u64 start;
        struct btrfs_free_space_op *op;
        void *private;
+       struct mutex cache_writeout_mutex;
+       struct list_head trimming_ranges;
 };
 
 struct btrfs_free_space_op {
index 83d646b..74faea3 100644 (file)
@@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root)
                          root->root_key.objectid);
        if (IS_ERR(tsk)) {
                btrfs_warn(root->fs_info, "failed to start inode caching task");
-               btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
+               btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE,
                                "disabling inode map caching");
        }
 }
@@ -364,6 +364,8 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root)
        ctl->start = 0;
        ctl->private = NULL;
        ctl->op = &free_ino_op;
+       INIT_LIST_HEAD(&ctl->trimming_ranges);
+       mutex_init(&ctl->cache_writeout_mutex);
 
        /*
         * Initially we allow to use 16K of ram to cache chunks of
index ff0dcc0..e687bb0 100644 (file)
@@ -382,7 +382,7 @@ static inline int inode_need_compress(struct inode *inode)
  * are written in the same order that the flusher thread sent them
  * down.
  */
-static noinline int compress_file_range(struct inode *inode,
+static noinline void compress_file_range(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start, u64 end,
                                        struct async_cow *async_cow,
@@ -411,14 +411,6 @@ static noinline int compress_file_range(struct inode *inode,
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
 
-       /*
-        * skip compression for a small file range(<=blocksize) that
-        * isn't an inline extent, since it dosen't save disk space at all.
-        */
-       if ((end - start + 1) <= blocksize &&
-           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
-               goto cleanup_and_bail_uncompressed;
-
        actual_end = min_t(u64, isize, end + 1);
 again:
        will_compress = 0;
@@ -440,6 +432,14 @@ again:
 
        total_compressed = actual_end - start;
 
+       /*
+        * skip compression for a small file range (<= blocksize) that
+        * isn't an inline extent, since it doesn't save disk space at all.
+        */
+       if (total_compressed <= blocksize &&
+          (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+               goto cleanup_and_bail_uncompressed;
+
        /* we want to make sure that amount of ram required to uncompress
         * an extent is reasonable, so we limit the total size in ram
         * of a compressed extent to 128k.  This is a crucial number
@@ -527,7 +527,10 @@ cont:
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
                                EXTENT_DEFRAG;
+                       unsigned long page_error_op;
+
                        clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
+                       page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
 
                        /*
                         * inline extent creation worked or returned error,
@@ -538,6 +541,7 @@ cont:
                                                     clear_flags, PAGE_UNLOCK |
                                                     PAGE_CLEAR_DIRTY |
                                                     PAGE_SET_WRITEBACK |
+                                                    page_error_op |
                                                     PAGE_END_WRITEBACK);
                        goto free_pages_out;
                }
@@ -620,8 +624,7 @@ cleanup_and_bail_uncompressed:
                *num_added += 1;
        }
 
-out:
-       return ret;
+       return;
 
 free_pages_out:
        for (i = 0; i < nr_pages_ret; i++) {
@@ -629,8 +632,22 @@ free_pages_out:
                page_cache_release(pages[i]);
        }
        kfree(pages);
+}
 
-       goto out;
+static void free_async_extent_pages(struct async_extent *async_extent)
+{
+       int i;
+
+       if (!async_extent->pages)
+               return;
+
+       for (i = 0; i < async_extent->nr_pages; i++) {
+               WARN_ON(async_extent->pages[i]->mapping);
+               page_cache_release(async_extent->pages[i]);
+       }
+       kfree(async_extent->pages);
+       async_extent->nr_pages = 0;
+       async_extent->pages = NULL;
 }
 
 /*
@@ -639,7 +656,7 @@ free_pages_out:
  * queued.  We walk all the async extents created by compress_file_range
  * and send them down to the disk.
  */
-static noinline int submit_compressed_extents(struct inode *inode,
+static noinline void submit_compressed_extents(struct inode *inode,
                                              struct async_cow *async_cow)
 {
        struct async_extent *async_extent;
@@ -651,9 +668,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
        struct extent_io_tree *io_tree;
        int ret = 0;
 
-       if (list_empty(&async_cow->extents))
-               return 0;
-
 again:
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
@@ -709,15 +723,7 @@ retry:
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
                if (ret) {
-                       int i;
-
-                       for (i = 0; i < async_extent->nr_pages; i++) {
-                               WARN_ON(async_extent->pages[i]->mapping);
-                               page_cache_release(async_extent->pages[i]);
-                       }
-                       kfree(async_extent->pages);
-                       async_extent->nr_pages = 0;
-                       async_extent->pages = NULL;
+                       free_async_extent_pages(async_extent);
 
                        if (ret == -ENOSPC) {
                                unlock_extent(io_tree, async_extent->start,
@@ -814,15 +820,26 @@ retry:
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages);
+               if (ret) {
+                       struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+                       struct page *p = async_extent->pages[0];
+                       const u64 start = async_extent->start;
+                       const u64 end = start + async_extent->ram_size - 1;
+
+                       p->mapping = inode->i_mapping;
+                       tree->ops->writepage_end_io_hook(p, start, end,
+                                                        NULL, 0);
+                       p->mapping = NULL;
+                       extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+                                                    PAGE_END_WRITEBACK |
+                                                    PAGE_SET_ERROR);
+                       free_async_extent_pages(async_extent);
+               }
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
-               if (ret)
-                       goto out;
                cond_resched();
        }
-       ret = 0;
-out:
-       return ret;
+       return;
 out_free_reserve:
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_free:
@@ -832,7 +849,9 @@ out_free:
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
-                                    PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+                                    PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
+                                    PAGE_SET_ERROR);
+       free_async_extent_pages(async_extent);
        kfree(async_extent);
        goto again;
 }
@@ -1318,7 +1337,7 @@ next_slot:
                         * we fall into common COW way.
                         */
                        if (!nolock) {
-                               err = btrfs_start_nocow_write(root);
+                               err = btrfs_start_write_no_snapshoting(root);
                                if (!err)
                                        goto out_check;
                        }
@@ -1342,7 +1361,7 @@ out_check:
                if (extent_end <= start) {
                        path->slots[0]++;
                        if (!nolock && nocow)
-                               btrfs_end_nocow_write(root);
+                               btrfs_end_write_no_snapshoting(root);
                        goto next_slot;
                }
                if (!nocow) {
@@ -1362,7 +1381,7 @@ out_check:
                                             page_started, nr_written, 1);
                        if (ret) {
                                if (!nolock && nocow)
-                                       btrfs_end_nocow_write(root);
+                                       btrfs_end_write_no_snapshoting(root);
                                goto error;
                        }
                        cow_start = (u64)-1;
@@ -1413,7 +1432,7 @@ out_check:
                                                      num_bytes);
                        if (ret) {
                                if (!nolock && nocow)
-                                       btrfs_end_nocow_write(root);
+                                       btrfs_end_write_no_snapshoting(root);
                                goto error;
                        }
                }
@@ -1424,7 +1443,7 @@ out_check:
                                             EXTENT_DELALLOC, PAGE_UNLOCK |
                                             PAGE_SET_PRIVATE2);
                if (!nolock && nocow)
-                       btrfs_end_nocow_write(root);
+                       btrfs_end_write_no_snapshoting(root);
                cur_offset = extent_end;
                if (cur_offset > end)
                        break;
@@ -4580,6 +4599,26 @@ next:
        return err;
 }
 
+static int wait_snapshoting_atomic_t(atomic_t *a)
+{
+       schedule();
+       return 0;
+}
+
+static void wait_for_snapshot_creation(struct btrfs_root *root)
+{
+       while (true) {
+               int ret;
+
+               ret = btrfs_start_write_no_snapshoting(root);
+               if (ret)
+                       break;
+               wait_on_atomic_t(&root->will_be_snapshoted,
+                                wait_snapshoting_atomic_t,
+                                TASK_UNINTERRUPTIBLE);
+       }
+}
+
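
wait_for_snapshot_creation() above loops: try to enter the no-snapshotting write section, and if that fails because a snapshot is pending, sleep until will_be_snapshoted drops to zero and try again; the matching wake-up is the atomic_dec_and_test() + wake_up_atomic_t() added to create_snapshot(). The following pthread sketch models the same rendezvous with a condition variable instead of wait_on_atomic_t(); it is a simplified model, not the kernel mechanism.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  no_snapshot = PTHREAD_COND_INITIALIZER;
static int will_be_snapshoted;          /* pending snapshot count */

/* Writer side: roughly what wait_for_snapshot_creation() does. */
static void wait_for_snapshot_creation_model(void)
{
        pthread_mutex_lock(&lock);
        while (will_be_snapshoted)      /* "start write" would fail */
                pthread_cond_wait(&no_snapshot, &lock);
        /* ...now inside the no-snapshotting write section... */
        pthread_mutex_unlock(&lock);
}

/* Snapshot side: the dec-and-test plus wake-up from create_snapshot(). */
static void snapshot_done_model(void)
{
        pthread_mutex_lock(&lock);
        if (--will_be_snapshoted == 0)
                pthread_cond_broadcast(&no_snapshot);
        pthread_mutex_unlock(&lock);
}

static void *snapshot_thread(void *arg)
{
        (void)arg;
        snapshot_done_model();
        return NULL;
}

int main(void)
{
        pthread_t t;

        will_be_snapshoted = 1;         /* a snapshot is in flight */
        pthread_create(&t, NULL, snapshot_thread, NULL);
        wait_for_snapshot_creation_model();
        pthread_join(t, NULL);
        puts("expanding truncate may proceed");
        return 0;
}
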
 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4604,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 
        if (newsize > oldsize) {
                truncate_pagecache(inode, newsize);
+               /*
+                * Don't do an expanding truncate while snapshotting is ongoing.
+                * This is to ensure the snapshot captures a fully consistent
+                * state of this file - if the snapshot captures this expanding
+                * truncation, it must capture all writes that happened before
+                * this truncation.
+                */
+               wait_for_snapshot_creation(root);
                ret = btrfs_cont_expand(inode, oldsize, newsize);
-               if (ret)
+               if (ret) {
+                       btrfs_end_write_no_snapshoting(root);
                        return ret;
+               }
 
                trans = btrfs_start_transaction(root, 1);
-               if (IS_ERR(trans))
+               if (IS_ERR(trans)) {
+                       btrfs_end_write_no_snapshoting(root);
                        return PTR_ERR(trans);
+               }
 
                i_size_write(inode, newsize);
                btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
                ret = btrfs_update_inode(trans, root, inode);
+               btrfs_end_write_no_snapshoting(root);
                btrfs_end_transaction(trans, root);
        } else {
 
@@ -7000,9 +7052,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                        btrfs_put_ordered_extent(ordered);
                } else {
                        /* Screw you mmap */
-                       ret = filemap_write_and_wait_range(inode->i_mapping,
-                                                          lockstart,
-                                                          lockend);
+                       ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
+                       if (ret)
+                               break;
+                       ret = filemap_fdatawait_range(inode->i_mapping,
+                                                     lockstart,
+                                                     lockend);
                        if (ret)
                                break;
 
@@ -9442,6 +9497,21 @@ out_inode:
 
 }
 
+/* Inspired by filemap_check_errors() */
+int btrfs_inode_check_errors(struct inode *inode)
+{
+       int ret = 0;
+
+       if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
+           test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
+               ret = -ENOSPC;
+       if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
+           test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
+               ret = -EIO;
+
+       return ret;
+}
+
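
btrfs_inode_check_errors() mirrors filemap_check_errors(): AS_ENOSPC and AS_EIO are sticky bits on the mapping that are consumed with test_and_clear_bit(), and an EIO seen after an ENOSPC overrides it. A tiny C11-atomics model of consume-on-read error bits follows; the bit positions and names are invented for the sketch.

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

#define AS_ENOSPC_BIT (1u << 0)        /* bit positions invented */
#define AS_EIO_BIT    (1u << 1)

static atomic_uint mapping_flags;

/* test_and_clear_bit() equivalent: nonzero if the bit was set before. */
static unsigned int test_and_clear(atomic_uint *flags, unsigned int bit)
{
        return atomic_fetch_and(flags, ~bit) & bit;
}

/* Consume-on-read error reporting; EIO takes precedence over ENOSPC. */
static int check_errors(void)
{
        int ret = 0;

        if (test_and_clear(&mapping_flags, AS_ENOSPC_BIT))
                ret = -ENOSPC;
        if (test_and_clear(&mapping_flags, AS_EIO_BIT))
                ret = -EIO;
        return ret;
}

int main(void)
{
        atomic_fetch_or(&mapping_flags, AS_ENOSPC_BIT | AS_EIO_BIT);
        printf("first check:  %d\n", check_errors());  /* -EIO, bits cleared */
        printf("second check: %d\n", check_errors());  /* 0 */
        return 0;
}
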
 static const struct inode_operations btrfs_dir_inode_operations = {
        .getattr        = btrfs_getattr,
        .lookup         = btrfs_lookup,
index 080fe66..d49fe8a 100644 (file)
@@ -617,7 +617,7 @@ fail:
        return ret;
 }
 
-static void btrfs_wait_nocow_write(struct btrfs_root *root)
+static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
 {
        s64 writers;
        DEFINE_WAIT(wait);
@@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 
        atomic_inc(&root->will_be_snapshoted);
        smp_mb__after_atomic();
-       btrfs_wait_nocow_write(root);
+       btrfs_wait_for_no_snapshoting_writes(root);
 
        ret = btrfs_start_delalloc_inodes(root, 0);
        if (ret)
@@ -717,35 +717,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
        if (ret)
                goto fail;
 
-       /*
-        * If orphan cleanup did remove any orphans, it means the tree was
-        * modified and therefore the commit root is not the same as the
-        * current root anymore. This is a problem, because send uses the
-        * commit root and therefore can see inode items that don't exist
-        * in the current root anymore, and for example make calls to
-        * btrfs_iget, which will do tree lookups based on the current root
-        * and not on the commit root. Those lookups will fail, returning a
-        * -ESTALE error, and making send fail with that error. So make sure
-        * a send does not see any orphans we have just removed, and that it
-        * will see the same inodes regardless of whether a transaction
-        * commit happened before it started (meaning that the commit root
-        * will be the same as the current root) or not.
-        */
-       if (readonly && pending_snapshot->snap->node !=
-           pending_snapshot->snap->commit_root) {
-               trans = btrfs_join_transaction(pending_snapshot->snap);
-               if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
-                       ret = PTR_ERR(trans);
-                       goto fail;
-               }
-               if (!IS_ERR(trans)) {
-                       ret = btrfs_commit_transaction(trans,
-                                                      pending_snapshot->snap);
-                       if (ret)
-                               goto fail;
-               }
-       }
-
        inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);
@@ -761,7 +732,8 @@ fail:
 free:
        kfree(pending_snapshot);
 out:
-       atomic_dec(&root->will_be_snapshoted);
+       if (atomic_dec_and_test(&root->will_be_snapshoted))
+               wake_up_atomic_t(&root->will_be_snapshoted);
        return ret;
 }
 
index ac734ec..534544e 100644 (file)
@@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
        INIT_LIST_HEAD(&entry->work_list);
        init_completion(&entry->completion);
        INIT_LIST_HEAD(&entry->log_list);
+       INIT_LIST_HEAD(&entry->trans_list);
 
        trace_btrfs_ordered_extent_add(inode, entry);
 
@@ -431,19 +432,31 @@ out:
 
 /* Needs to either be called under a log transaction or the log_mutex */
 void btrfs_get_logged_extents(struct inode *inode,
-                             struct list_head *logged_list)
+                             struct list_head *logged_list,
+                             const loff_t start,
+                             const loff_t end)
 {
        struct btrfs_ordered_inode_tree *tree;
        struct btrfs_ordered_extent *ordered;
        struct rb_node *n;
+       struct rb_node *prev;
 
        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irq(&tree->lock);
-       for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+       n = __tree_search(&tree->tree, end, &prev);
+       if (!n)
+               n = prev;
+       for (; n; n = rb_prev(n)) {
                ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+               if (ordered->file_offset > end)
+                       continue;
+               if (entry_end(ordered) <= start)
+                       break;
                if (!list_empty(&ordered->log_list))
                        continue;
-               list_add_tail(&ordered->log_list, logged_list);
+               if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+                       continue;
+               list_add(&ordered->log_list, logged_list);
                atomic_inc(&ordered->refs);
        }
        spin_unlock_irq(&tree->lock);
@@ -472,7 +485,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list,
        spin_unlock_irq(&log->log_extents_lock[index]);
 }
 
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *log, u64 transid)
 {
        struct btrfs_ordered_extent *ordered;
        int index = transid % 2;
@@ -497,7 +511,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
                wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
                                                   &ordered->flags));
 
-               btrfs_put_ordered_extent(ordered);
+               if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+                       list_add_tail(&ordered->trans_list, &trans->ordered);
                spin_lock_irq(&log->log_extents_lock[index]);
        }
        spin_unlock_irq(&log->log_extents_lock[index]);
@@ -725,30 +740,10 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
        /* start IO across the range first to instantiate any delalloc
         * extents
         */
-       ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+       ret = btrfs_fdatawrite_range(inode, start, orig_end);
        if (ret)
                return ret;
-       /*
-        * So with compression we will find and lock a dirty page and clear the
-        * first one as dirty, setup an async extent, and immediately return
-        * with the entire range locked but with nobody actually marked with
-        * writeback.  So we can't just filemap_write_and_wait_range() and
-        * expect it to work since it will just kick off a thread to do the
-        * actual work.  So we need to call filemap_fdatawrite_range _again_
-        * since it will wait on the page lock, which won't be unlocked until
-        * after the pages have been marked as writeback and so we're good to go
-        * from there.  We have to do this otherwise we'll miss the ordered
-        * extents and that results in badness.  Please Josef, do not think you
-        * know better and pull this out at some point in the future, it is
-        * right and you are wrong.
-        */
-       if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-                    &BTRFS_I(inode)->runtime_flags)) {
-               ret = filemap_fdatawrite_range(inode->i_mapping, start,
-                                              orig_end);
-               if (ret)
-                       return ret;
-       }
+
        ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
        if (ret)
                return ret;
index d81a274..e96cd4c 100644 (file)
@@ -71,6 +71,8 @@ struct btrfs_ordered_sum {
                                       ordered extent */
 #define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
 
+#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
+                                * in the logging code. */
 struct btrfs_ordered_extent {
        /* logical offset in the file */
        u64 file_offset;
@@ -121,6 +123,9 @@ struct btrfs_ordered_extent {
        /* If we need to wait on this to be done */
        struct list_head log_list;
 
+       /* If the transaction needs to wait on this ordered extent */
+       struct list_head trans_list;
+
        /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
        wait_queue_head_t wait;
 
@@ -193,11 +198,14 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
 void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
 void btrfs_get_logged_extents(struct inode *inode,
-                             struct list_head *logged_list);
+                             struct list_head *logged_list,
+                             const loff_t start,
+                             const loff_t end);
 void btrfs_put_logged_extents(struct list_head *logged_list);
 void btrfs_submit_logged_extents(struct list_head *logged_list,
                                 struct btrfs_root *log);
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *log, u64 transid);
 void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
 int __init ordered_data_init(void);
 void ordered_data_exit(void);
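
btrfs_wait_logged_extents() now hands each ordered extent it has waited on to the transaction via the new trans_list, and BTRFS_ORDERED_LOGGED is claimed with test_and_set_bit() so an extent reached from two logging paths is queued on trans->ordered only once. The same claim-once idiom expressed with C11 atomics, using invented names, looks like this:

#include <stdatomic.h>
#include <stdio.h>

struct ordered_extent_model {
        atomic_uint flags;
        int on_trans_list;
};

#define ORDERED_LOGGED (1u << 10)       /* same bit number as the patch */

/* Returns 1 the first time it is called for a given extent, 0 after. */
static int claim_for_trans_list(struct ordered_extent_model *oe)
{
        unsigned int old = atomic_fetch_or(&oe->flags, ORDERED_LOGGED);

        if (old & ORDERED_LOGGED)
                return 0;               /* someone already queued it */
        oe->on_trans_list = 1;          /* list_add_tail(&oe->trans_list, ...) */
        return 1;
}

int main(void)
{
        struct ordered_extent_model oe = { .flags = 0 };

        printf("first caller queues it: %d\n", claim_for_trans_list(&oe));
        printf("second caller skips it: %d\n", claim_for_trans_list(&oe));
        return 0;
}
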
index 6a41631..8ab2a17 100644 (file)
  */
 #define RBIO_CACHE_READY_BIT   3
 
+/*
+ * The bbio and raid_map are managed by the caller, so we shouldn't free
+ * them here. Besides that, rbios with this flag should not be cached,
+ * because we need the raid_map to check whether two rbios belong to the
+ * same stripe, and it is very likely that the caller has already freed
+ * the raid_map, so don't cache those rbios.
+ */
+#define RBIO_HOLD_BBIO_MAP_BIT 4
 
 #define RBIO_CACHE_SIZE 1024
 
+enum btrfs_rbio_ops {
+       BTRFS_RBIO_WRITE        = 0,
+       BTRFS_RBIO_READ_REBUILD = 1,
+       BTRFS_RBIO_PARITY_SCRUB = 2,
+};
+
 struct btrfs_raid_bio {
        struct btrfs_fs_info *fs_info;
        struct btrfs_bio *bbio;
@@ -117,13 +131,16 @@ struct btrfs_raid_bio {
        /* number of data stripes (no p/q) */
        int nr_data;
 
+       int real_stripes;
+
+       int stripe_npages;
        /*
         * set if we're doing a parity rebuild
         * for a read from higher up, which is handled
         * differently from a parity rebuild as part of
         * rmw
         */
-       int read_rebuild;
+       enum btrfs_rbio_ops operation;
 
        /* first bad stripe */
        int faila;
@@ -131,6 +148,7 @@ struct btrfs_raid_bio {
        /* second bad stripe (for raid6 use) */
        int failb;
 
+       int scrubp;
        /*
         * number of pages needed to represent the full
         * stripe
@@ -144,8 +162,13 @@ struct btrfs_raid_bio {
         */
        int bio_list_bytes;
 
+       int generic_bio_cnt;
+
        atomic_t refs;
 
+       atomic_t stripes_pending;
+
+       atomic_t error;
        /*
         * these are two arrays of pointers.  We allocate the
         * rbio big enough to hold them both and setup their
@@ -162,6 +185,11 @@ struct btrfs_raid_bio {
         * here for faster lookup
         */
        struct page **bio_pages;
+
+       /*
+        * bitmap to record which horizontal stripes have data
+        */
+       unsigned long *dbitmap;
 };
 
 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
@@ -176,6 +204,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio);
 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
 
+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+                                        int need_check);
+static void async_scrub_parity(struct btrfs_raid_bio *rbio);
+
 /*
  * the stripe hash table is used for locking, and to collect
  * bios in hopes of making a full stripe
@@ -324,6 +356,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
 {
        bio_list_merge(&dest->bio_list, &victim->bio_list);
        dest->bio_list_bytes += victim->bio_list_bytes;
+       dest->generic_bio_cnt += victim->generic_bio_cnt;
        bio_list_init(&victim->bio_list);
 }
 
@@ -577,11 +610,20 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
            cur->raid_map[0])
                return 0;
 
-       /* reads can't merge with writes */
-       if (last->read_rebuild !=
-           cur->read_rebuild) {
+       /* we can't merge with different operations */
+       if (last->operation != cur->operation)
+               return 0;
+       /*
+        * A parity scrub needs to read the full stripe from the drive,
+        * check and repair the parity and write the new results.
+        *
+        * We're not allowed to add any new bios to the
+        * bio list here, anyone else that wants to
+        * change this stripe needs to do their own rmw.
+        */
+       if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
+           cur->operation == BTRFS_RBIO_PARITY_SCRUB)
                return 0;
-       }
 
        return 1;
 }
@@ -601,7 +643,7 @@ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
  */
 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
 {
-       if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
+       if (rbio->nr_data + 1 == rbio->real_stripes)
                return NULL;
 
        index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
@@ -772,11 +814,14 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
                        spin_unlock(&rbio->bio_list_lock);
                        spin_unlock_irqrestore(&h->lock, flags);
 
-                       if (next->read_rebuild)
+                       if (next->operation == BTRFS_RBIO_READ_REBUILD)
                                async_read_rebuild(next);
-                       else {
+                       else if (next->operation == BTRFS_RBIO_WRITE) {
                                steal_rbio(rbio, next);
                                async_rmw_stripe(next);
+                       } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
+                               steal_rbio(rbio, next);
+                               async_scrub_parity(next);
                        }
 
                        goto done_nolock;
@@ -796,6 +841,21 @@ done_nolock:
                remove_rbio_from_cache(rbio);
 }
 
+static inline void
+__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
+{
+       if (need) {
+               kfree(raid_map);
+               kfree(bbio);
+       }
+}
+
+static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
+{
+       __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
+                       !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
+}
+
 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 {
        int i;
@@ -814,8 +874,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
                        rbio->stripe_pages[i] = NULL;
                }
        }
-       kfree(rbio->raid_map);
-       kfree(rbio->bbio);
+
+       free_bbio_and_raid_map(rbio);
+
        kfree(rbio);
 }
 
@@ -833,6 +894,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
 {
        struct bio *cur = bio_list_get(&rbio->bio_list);
        struct bio *next;
+
+       if (rbio->generic_bio_cnt)
+               btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+
        free_raid_bio(rbio);
 
        while (cur) {
@@ -858,13 +923,13 @@ static void raid_write_end_io(struct bio *bio, int err)
 
        bio_put(bio);
 
-       if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+       if (!atomic_dec_and_test(&rbio->stripes_pending))
                return;
 
        err = 0;
 
        /* OK, we have read all the stripes we need to. */
-       if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+       if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
                err = -EIO;
 
        rbio_orig_end_io(rbio, err, 0);
@@ -925,16 +990,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 {
        struct btrfs_raid_bio *rbio;
        int nr_data = 0;
-       int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
+       int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+       int num_pages = rbio_nr_pages(stripe_len, real_stripes);
+       int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
        void *p;
 
-       rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
+       rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
+                      DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
                        GFP_NOFS);
-       if (!rbio) {
-               kfree(raid_map);
-               kfree(bbio);
+       if (!rbio)
                return ERR_PTR(-ENOMEM);
-       }
 
        bio_list_init(&rbio->bio_list);
        INIT_LIST_HEAD(&rbio->plug_list);
@@ -946,9 +1011,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
        rbio->fs_info = root->fs_info;
        rbio->stripe_len = stripe_len;
        rbio->nr_pages = num_pages;
+       rbio->real_stripes = real_stripes;
+       rbio->stripe_npages = stripe_npages;
        rbio->faila = -1;
        rbio->failb = -1;
        atomic_set(&rbio->refs, 1);
+       atomic_set(&rbio->error, 0);
+       atomic_set(&rbio->stripes_pending, 0);
 
        /*
         * the stripe_pages and bio_pages array point to the extra
@@ -957,11 +1026,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
        p = rbio + 1;
        rbio->stripe_pages = p;
        rbio->bio_pages = p + sizeof(struct page *) * num_pages;
+       rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
 
-       if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
-               nr_data = bbio->num_stripes - 2;
+       if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
+               nr_data = real_stripes - 2;
        else
-               nr_data = bbio->num_stripes - 1;
+               nr_data = real_stripes - 1;
 
        rbio->nr_data = nr_data;
        return rbio;
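
alloc_rbio() above sizes a single kzalloc() to hold the struct, the two page-pointer arrays and the new dbitmap, then carves interior pointers out of the trailing space starting at p = rbio + 1. A standalone sketch of that trailing-storage layout, using an illustrative struct of its own rather than the real btrfs_raid_bio:

#include <stdio.h>
#include <stdlib.h>

struct rbio_model {
        int nr_pages;
        int stripe_npages;
        void **stripe_pages;            /* points into the trailing storage */
        void **bio_pages;
        unsigned char *dbitmap;
        /* trailing storage follows the struct in the same allocation */
};

static struct rbio_model *alloc_rbio_model(int num_pages, int stripe_npages)
{
        size_t ptrs = sizeof(void *) * num_pages * 2;
        size_t bmap = (stripe_npages + 7) / 8;  /* one bit per stripe page */
        struct rbio_model *rbio;
        char *p;

        rbio = calloc(1, sizeof(*rbio) + ptrs + bmap);
        if (!rbio)
                return NULL;

        rbio->nr_pages = num_pages;
        rbio->stripe_npages = stripe_npages;

        /* Carve the trailing area, mirroring "p = rbio + 1". */
        p = (char *)(rbio + 1);
        rbio->stripe_pages = (void **)p;
        rbio->bio_pages    = (void **)(p + sizeof(void *) * num_pages);
        rbio->dbitmap      = (unsigned char *)(p + ptrs);
        return rbio;
}

int main(void)
{
        struct rbio_model *rbio = alloc_rbio_model(16, 16);

        if (!rbio)
                return 1;
        rbio->dbitmap[0] |= 1;          /* mark the first horizontal stripe */
        printf("dbitmap[0]=%#x\n", (unsigned)rbio->dbitmap[0]);
        free(rbio);                     /* one free releases everything */
        return 0;
}
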
@@ -1073,7 +1143,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
 {
        if (rbio->faila >= 0 || rbio->failb >= 0) {
-               BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
+               BUG_ON(rbio->faila == rbio->real_stripes - 1);
                __raid56_parity_recover(rbio);
        } else {
                finish_rmw(rbio);
@@ -1134,7 +1204,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 {
        struct btrfs_bio *bbio = rbio->bbio;
-       void *pointers[bbio->num_stripes];
+       void *pointers[rbio->real_stripes];
        int stripe_len = rbio->stripe_len;
        int nr_data = rbio->nr_data;
        int stripe;
@@ -1148,11 +1218,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 
        bio_list_init(&bio_list);
 
-       if (bbio->num_stripes - rbio->nr_data == 1) {
-               p_stripe = bbio->num_stripes - 1;
-       } else if (bbio->num_stripes - rbio->nr_data == 2) {
-               p_stripe = bbio->num_stripes - 2;
-               q_stripe = bbio->num_stripes - 1;
+       if (rbio->real_stripes - rbio->nr_data == 1) {
+               p_stripe = rbio->real_stripes - 1;
+       } else if (rbio->real_stripes - rbio->nr_data == 2) {
+               p_stripe = rbio->real_stripes - 2;
+               q_stripe = rbio->real_stripes - 1;
        } else {
                BUG();
        }
@@ -1169,7 +1239,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
        set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
        spin_unlock_irq(&rbio->bio_list_lock);
 
-       atomic_set(&rbio->bbio->error, 0);
+       atomic_set(&rbio->error, 0);
 
        /*
         * now that we've set rmw_locked, run through the
@@ -1209,7 +1279,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
                        SetPageUptodate(p);
                        pointers[stripe++] = kmap(p);
 
-                       raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
+                       raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
                                                pointers);
                } else {
                        /* raid5 */
@@ -1218,7 +1288,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
                }
 
 
-               for (stripe = 0; stripe < bbio->num_stripes; stripe++)
+               for (stripe = 0; stripe < rbio->real_stripes; stripe++)
                        kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
        }
 
@@ -1227,7 +1297,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
         * higher layers (the bio_list in our rbio) and our p/q.  Ignore
         * everything else.
         */
-       for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+       for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
                for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
                        struct page *page;
                        if (stripe < rbio->nr_data) {
@@ -1245,8 +1315,34 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
                }
        }
 
-       atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
-       BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
+       if (likely(!bbio->num_tgtdevs))
+               goto write_data;
+
+       for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+               if (!bbio->tgtdev_map[stripe])
+                       continue;
+
+               for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+                       struct page *page;
+                       if (stripe < rbio->nr_data) {
+                               page = page_in_rbio(rbio, stripe, pagenr, 1);
+                               if (!page)
+                                       continue;
+                       } else {
+                               page = rbio_stripe_page(rbio, stripe, pagenr);
+                       }
+
+                       ret = rbio_add_io_page(rbio, &bio_list, page,
+                                              rbio->bbio->tgtdev_map[stripe],
+                                              pagenr, rbio->stripe_len);
+                       if (ret)
+                               goto cleanup;
+               }
+       }
+
+write_data:
+       atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
+       BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
 
        while (1) {
                bio = bio_list_pop(&bio_list);
@@ -1283,7 +1379,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
                stripe = &rbio->bbio->stripes[i];
                stripe_start = stripe->physical;
                if (physical >= stripe_start &&
-                   physical < stripe_start + rbio->stripe_len) {
+                   physical < stripe_start + rbio->stripe_len &&
+                   bio->bi_bdev == stripe->dev->bdev) {
                        return i;
                }
        }
@@ -1331,11 +1428,11 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
        if (rbio->faila == -1) {
                /* first failure on this rbio */
                rbio->faila = failed;
-               atomic_inc(&rbio->bbio->error);
+               atomic_inc(&rbio->error);
        } else if (rbio->failb == -1) {
                /* second failure on this rbio */
                rbio->failb = failed;
-               atomic_inc(&rbio->bbio->error);
+               atomic_inc(&rbio->error);
        } else {
                ret = -EIO;
        }
@@ -1394,11 +1491,11 @@ static void raid_rmw_end_io(struct bio *bio, int err)
 
        bio_put(bio);
 
-       if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+       if (!atomic_dec_and_test(&rbio->stripes_pending))
                return;
 
        err = 0;
-       if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+       if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
                goto cleanup;
 
        /*
@@ -1439,7 +1536,6 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio)
 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 {
        int bios_to_read = 0;
-       struct btrfs_bio *bbio = rbio->bbio;
        struct bio_list bio_list;
        int ret;
        int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1455,7 +1551,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 
        index_rbio_pages(rbio);
 
-       atomic_set(&rbio->bbio->error, 0);
+       atomic_set(&rbio->error, 0);
        /*
         * build a list of bios to read all the missing parts of this
         * stripe
@@ -1503,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
         * the bbio may be freed once we submit the last bio.  Make sure
         * not to touch it after that
         */
-       atomic_set(&bbio->stripes_pending, bios_to_read);
+       atomic_set(&rbio->stripes_pending, bios_to_read);
        while (1) {
                bio = bio_list_pop(&bio_list);
                if (!bio)
@@ -1686,19 +1782,30 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
        struct btrfs_raid_bio *rbio;
        struct btrfs_plug_cb *plug = NULL;
        struct blk_plug_cb *cb;
+       int ret;
 
        rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-       if (IS_ERR(rbio))
+       if (IS_ERR(rbio)) {
+               __free_bbio_and_raid_map(bbio, raid_map, 1);
                return PTR_ERR(rbio);
+       }
        bio_list_add(&rbio->bio_list, bio);
        rbio->bio_list_bytes = bio->bi_iter.bi_size;
+       rbio->operation = BTRFS_RBIO_WRITE;
+
+       btrfs_bio_counter_inc_noblocked(root->fs_info);
+       rbio->generic_bio_cnt = 1;
 
        /*
         * don't plug on full rbios, just get them out the door
         * as quickly as we can
         */
-       if (rbio_is_full(rbio))
-               return full_stripe_write(rbio);
+       if (rbio_is_full(rbio)) {
+               ret = full_stripe_write(rbio);
+               if (ret)
+                       btrfs_bio_counter_dec(root->fs_info);
+               return ret;
+       }
 
        cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
                               sizeof(*plug));
@@ -1709,10 +1816,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
                        INIT_LIST_HEAD(&plug->rbio_list);
                }
                list_add_tail(&rbio->plug_list, &plug->rbio_list);
+               ret = 0;
        } else {
-               return __raid56_parity_write(rbio);
+               ret = __raid56_parity_write(rbio);
+               if (ret)
+                       btrfs_bio_counter_dec(root->fs_info);
        }
-       return 0;
+       return ret;
 }
 
 /*
@@ -1730,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
        int err;
        int i;
 
-       pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
+       pointers = kzalloc(rbio->real_stripes * sizeof(void *),
                           GFP_NOFS);
        if (!pointers) {
                err = -ENOMEM;
@@ -1740,7 +1850,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
        faila = rbio->faila;
        failb = rbio->failb;
 
-       if (rbio->read_rebuild) {
+       if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
                spin_lock_irq(&rbio->bio_list_lock);
                set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
                spin_unlock_irq(&rbio->bio_list_lock);
@@ -1749,15 +1859,23 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
        index_rbio_pages(rbio);
 
        for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+               /*
+                * When doing a parity scrub, the bitmap marks the
+                * horizontal stripes that contain data.
+                */
+               if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+                   !test_bit(pagenr, rbio->dbitmap))
+                       continue;
+
                /* setup our array of pointers with pages
                 * from each stripe
                 */
-               for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+               for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
                        /*
                         * if we're rebuilding a read, we have to use
                         * pages from the bio list
                         */
-                       if (rbio->read_rebuild &&
+                       if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
                            (stripe == faila || stripe == failb)) {
                                page = page_in_rbio(rbio, stripe, pagenr, 0);
                        } else {
@@ -1767,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                }
 
                /* all raid6 handling here */
-               if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
+               if (rbio->raid_map[rbio->real_stripes - 1] ==
                    RAID6_Q_STRIPE) {
 
                        /*
@@ -1817,10 +1935,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                        }
 
                        if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
-                               raid6_datap_recov(rbio->bbio->num_stripes,
+                               raid6_datap_recov(rbio->real_stripes,
                                                  PAGE_SIZE, faila, pointers);
                        } else {
-                               raid6_2data_recov(rbio->bbio->num_stripes,
+                               raid6_2data_recov(rbio->real_stripes,
                                                  PAGE_SIZE, faila, failb,
                                                  pointers);
                        }
@@ -1850,7 +1968,7 @@ pstripe:
                 * know they can be trusted.  If this was a read reconstruction,
                 * other endio functions will fiddle the uptodate bits
                 */
-               if (!rbio->read_rebuild) {
+               if (rbio->operation == BTRFS_RBIO_WRITE) {
                        for (i = 0;  i < nr_pages; i++) {
                                if (faila != -1) {
                                        page = rbio_stripe_page(rbio, faila, i);
@@ -1862,12 +1980,12 @@ pstripe:
                                }
                        }
                }
-               for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+               for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
                        /*
                         * if we're rebuilding a read, we have to use
                         * pages from the bio list
                         */
-                       if (rbio->read_rebuild &&
+                       if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
                            (stripe == faila || stripe == failb)) {
                                page = page_in_rbio(rbio, stripe, pagenr, 0);
                        } else {
@@ -1882,9 +2000,9 @@ cleanup:
        kfree(pointers);
 
 cleanup_io:
-
-       if (rbio->read_rebuild) {
-               if (err == 0)
+       if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+               if (err == 0 &&
+                   !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
                        cache_rbio_pages(rbio);
                else
                        clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -1893,7 +2011,13 @@ cleanup_io:
        } else if (err == 0) {
                rbio->faila = -1;
                rbio->failb = -1;
-               finish_rmw(rbio);
+
+               if (rbio->operation == BTRFS_RBIO_WRITE)
+                       finish_rmw(rbio);
+               else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
+                       finish_parity_scrub(rbio, 0);
+               else
+                       BUG();
        } else {
                rbio_orig_end_io(rbio, err, 0);
        }
@@ -1917,10 +2041,10 @@ static void raid_recover_end_io(struct bio *bio, int err)
                set_bio_pages_uptodate(bio);
        bio_put(bio);
 
-       if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+       if (!atomic_dec_and_test(&rbio->stripes_pending))
                return;
 
-       if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+       if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
                rbio_orig_end_io(rbio, -EIO, 0);
        else
                __raid_recover_end_io(rbio);
@@ -1937,7 +2061,6 @@ static void raid_recover_end_io(struct bio *bio, int err)
 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 {
        int bios_to_read = 0;
-       struct btrfs_bio *bbio = rbio->bbio;
        struct bio_list bio_list;
        int ret;
        int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1951,16 +2074,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
        if (ret)
                goto cleanup;
 
-       atomic_set(&rbio->bbio->error, 0);
+       atomic_set(&rbio->error, 0);
 
        /*
         * read everything that hasn't failed.  Thanks to the
         * stripe cache, it is possible that some or all of these
         * pages are going to be uptodate.
         */
-       for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+       for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
                if (rbio->faila == stripe || rbio->failb == stripe) {
-                       atomic_inc(&rbio->bbio->error);
+                       atomic_inc(&rbio->error);
                        continue;
                }
 
@@ -1990,7 +2113,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
                 * were up to date, or we might have no bios to read because
                 * the devices were gone.
                 */
-               if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
+               if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
                        __raid_recover_end_io(rbio);
                        goto out;
                } else {
@@ -2002,7 +2125,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
         * the bbio may be freed once we submit the last bio.  Make sure
         * not to touch it after that
         */
-       atomic_set(&bbio->stripes_pending, bios_to_read);
+       atomic_set(&rbio->stripes_pending, bios_to_read);
        while (1) {
                bio = bio_list_pop(&bio_list);
                if (!bio)
@@ -2021,7 +2144,7 @@ out:
        return 0;
 
 cleanup:
-       if (rbio->read_rebuild)
+       if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
                rbio_orig_end_io(rbio, -EIO, 0);
        return -EIO;
 }
@@ -2034,34 +2157,42 @@ cleanup:
  */
 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
                          struct btrfs_bio *bbio, u64 *raid_map,
-                         u64 stripe_len, int mirror_num)
+                         u64 stripe_len, int mirror_num, int generic_io)
 {
        struct btrfs_raid_bio *rbio;
        int ret;
 
        rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-       if (IS_ERR(rbio))
+       if (IS_ERR(rbio)) {
+               __free_bbio_and_raid_map(bbio, raid_map, generic_io);
                return PTR_ERR(rbio);
+       }
 
-       rbio->read_rebuild = 1;
+       rbio->operation = BTRFS_RBIO_READ_REBUILD;
        bio_list_add(&rbio->bio_list, bio);
        rbio->bio_list_bytes = bio->bi_iter.bi_size;
 
        rbio->faila = find_logical_bio_stripe(rbio, bio);
        if (rbio->faila == -1) {
                BUG();
-               kfree(raid_map);
-               kfree(bbio);
+               __free_bbio_and_raid_map(bbio, raid_map, generic_io);
                kfree(rbio);
                return -EIO;
        }
 
+       if (generic_io) {
+               btrfs_bio_counter_inc_noblocked(root->fs_info);
+               rbio->generic_bio_cnt = 1;
+       } else {
+               set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
+       }
+
        /*
         * reconstruct from the q stripe if they are
         * asking for mirror 3
         */
        if (mirror_num == 3)
-               rbio->failb = bbio->num_stripes - 2;
+               rbio->failb = rbio->real_stripes - 2;
 
        ret = lock_stripe_add(rbio);
 
@@ -2098,3 +2229,483 @@ static void read_rebuild_work(struct btrfs_work *work)
        rbio = container_of(work, struct btrfs_raid_bio, work);
        __raid56_parity_recover(rbio);
 }
+
+/*
+ * The following code is used to scrub/replace the parity stripe.
+ *
+ * Note: We must make sure that all the pages added to the scrub/replace
+ * raid bio are correct and do not change while the scrub/replace runs;
+ * that is, they hold only metadata or file data with checksums.
+ */
+
+struct btrfs_raid_bio *
+raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
+                              struct btrfs_bio *bbio, u64 *raid_map,
+                              u64 stripe_len, struct btrfs_device *scrub_dev,
+                              unsigned long *dbitmap, int stripe_nsectors)
+{
+       struct btrfs_raid_bio *rbio;
+       int i;
+
+       rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+       if (IS_ERR(rbio))
+               return NULL;
+       bio_list_add(&rbio->bio_list, bio);
+       /*
+        * This is a special bio that is only used to hold the completion
+        * handler, so the scrub rbio can be treated like the other rbio types
+        */
+       ASSERT(!bio->bi_iter.bi_size);
+       rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
+
+       for (i = 0; i < rbio->real_stripes; i++) {
+               if (bbio->stripes[i].dev == scrub_dev) {
+                       rbio->scrubp = i;
+                       break;
+               }
+       }
+
+       /* For now we only support the case where sectorsize equals page size */
+       ASSERT(root->sectorsize == PAGE_SIZE);
+       ASSERT(rbio->stripe_npages == stripe_nsectors);
+       bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
+
+       return rbio;
+}
+
+void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
+                                  struct page *page, u64 logical)
+{
+       int stripe_offset;
+       int index;
+
+       ASSERT(logical >= rbio->raid_map[0]);
+       ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
+                               rbio->stripe_len * rbio->nr_data);
+       stripe_offset = (int)(logical - rbio->raid_map[0]);
+       index = stripe_offset >> PAGE_CACHE_SHIFT;
+       rbio->bio_pages[index] = page;
+}
+
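A quick standalone illustration of the index arithmetic above: the page slot is just the byte offset from raid_map[0] shifted by the page shift. Everything below (the PAGE_SHIFT value, the sample addresses, the helper name) is assumed for the sketch, not taken from the patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Map a logical address to a page slot inside the full data stripe,
 * the same arithmetic raid56_parity_add_scrub_pages() does with
 * rbio->raid_map[0]. */
static unsigned int page_index(uint64_t logical, uint64_t raid_map0)
{
        assert(logical >= raid_map0);
        return (unsigned int)((logical - raid_map0) >> PAGE_SHIFT);
}

int main(void)
{
        uint64_t raid_map0 = 1048576;   /* logical start of the full stripe */

        /* the third page of the data area lands in slot 2 */
        printf("%u\n", page_index(raid_map0 + 2 * PAGE_SIZE, raid_map0));
        return 0;
}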
+/*
+ * We only scrub the parity of horizontal stripes for which we have correct
+ * data, so we don't need to allocate pages for every stripe.
+ */
+static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
+{
+       int i;
+       int bit;
+       int index;
+       struct page *page;
+
+       for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
+               for (i = 0; i < rbio->real_stripes; i++) {
+                       index = i * rbio->stripe_npages + bit;
+                       if (rbio->stripe_pages[index])
+                               continue;
+
+                       page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                       if (!page)
+                               return -ENOMEM;
+                       rbio->stripe_pages[index] = page;
+                       ClearPageUptodate(page);
+               }
+       }
+       return 0;
+}
+
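A hedged sketch of what alloc_rbio_essential_pages() is doing: pages are allocated only for the stripe rows whose bit is set in the data bitmap, indexed row-major as stripe * stripe_npages + row. The helper below uses a plain unsigned long instead of the kernel bitmap API, and all names are made up for illustration.

#include <stdlib.h>

/* Allocate a buffer only for the rows whose bit is set in dbitmap.
 * Returns -1 on allocation failure, 0 on success. */
static int alloc_marked_rows(void **pages, unsigned long dbitmap,
                             int nr_stripes, int stripe_npages, size_t page_sz)
{
        for (int row = 0; row < stripe_npages; row++) {
                if (!(dbitmap & (1UL << row)))
                        continue;
                for (int stripe = 0; stripe < nr_stripes; stripe++) {
                        int idx = stripe * stripe_npages + row;

                        if (pages[idx])
                                continue;
                        pages[idx] = calloc(1, page_sz);
                        if (!pages[idx])
                                return -1;
                }
        }
        return 0;
}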
+/*
+ * end io function used by finish_parity_scrub.  When we finally
+ * get here, we've written back the scrubbed parity
+ */
+static void raid_write_parity_end_io(struct bio *bio, int err)
+{
+       struct btrfs_raid_bio *rbio = bio->bi_private;
+
+       if (err)
+               fail_bio_stripe(rbio, bio);
+
+       bio_put(bio);
+
+       if (!atomic_dec_and_test(&rbio->stripes_pending))
+               return;
+
+       err = 0;
+
+       if (atomic_read(&rbio->error))
+               err = -EIO;
+
+       rbio_orig_end_io(rbio, err, 0);
+}
+
+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+                                        int need_check)
+{
+       struct btrfs_bio *bbio = rbio->bbio;
+       void *pointers[rbio->real_stripes];
+       DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
+       int nr_data = rbio->nr_data;
+       int stripe;
+       int pagenr;
+       int p_stripe = -1;
+       int q_stripe = -1;
+       struct page *p_page = NULL;
+       struct page *q_page = NULL;
+       struct bio_list bio_list;
+       struct bio *bio;
+       int is_replace = 0;
+       int ret;
+
+       bio_list_init(&bio_list);
+
+       if (rbio->real_stripes - rbio->nr_data == 1) {
+               p_stripe = rbio->real_stripes - 1;
+       } else if (rbio->real_stripes - rbio->nr_data == 2) {
+               p_stripe = rbio->real_stripes - 2;
+               q_stripe = rbio->real_stripes - 1;
+       } else {
+               BUG();
+       }
+
+       if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
+               is_replace = 1;
+               bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
+       }
+
+       /*
+        * The higher layers (the scrubber) are unlikely to use this
+        * area of the disk again soon, so don't cache it.
+        */
+       clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+       if (!need_check)
+               goto writeback;
+
+       p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       if (!p_page)
+               goto cleanup;
+       SetPageUptodate(p_page);
+
+       if (q_stripe != -1) {
+               q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+               if (!q_page) {
+                       __free_page(p_page);
+                       goto cleanup;
+               }
+               SetPageUptodate(q_page);
+       }
+
+       atomic_set(&rbio->error, 0);
+
+       for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+               struct page *p;
+               void *parity;
+               /* first collect one page from each data stripe */
+               for (stripe = 0; stripe < nr_data; stripe++) {
+                       p = page_in_rbio(rbio, stripe, pagenr, 0);
+                       pointers[stripe] = kmap(p);
+               }
+
+               /* then add the parity stripe */
+               pointers[stripe++] = kmap(p_page);
+
+               if (q_stripe != -1) {
+
+                       /*
+                        * raid6, add the qstripe and call the
+                        * library function to fill in our p/q
+                        */
+                       pointers[stripe++] = kmap(q_page);
+
+                       raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+                                               pointers);
+               } else {
+                       /* raid5 */
+                       memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+                       run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+               }
+
+               /* Check the scrubbed parity and repair it if it is wrong */
+               p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+               parity = kmap(p);
+               if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
+                       memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
+               else
+                       /* Parity is correct, no need to write it back */
+                       bitmap_clear(rbio->dbitmap, pagenr, 1);
+               kunmap(p);
+
+               for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+                       kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+       }
+
+       __free_page(p_page);
+       if (q_page)
+               __free_page(q_page);
+
+writeback:
+       /*
+        * time to start writing.  Make bios only for the scrubbed parity
+        * pages that changed (and, below, for the replace target device).
+        * Ignore everything else.
+        */
+       for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+               struct page *page;
+
+               page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+               ret = rbio_add_io_page(rbio, &bio_list,
+                              page, rbio->scrubp, pagenr, rbio->stripe_len);
+               if (ret)
+                       goto cleanup;
+       }
+
+       if (!is_replace)
+               goto submit_write;
+
+       for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
+               struct page *page;
+
+               page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+               ret = rbio_add_io_page(rbio, &bio_list, page,
+                                      bbio->tgtdev_map[rbio->scrubp],
+                                      pagenr, rbio->stripe_len);
+               if (ret)
+                       goto cleanup;
+       }
+
+submit_write:
+       nr_data = bio_list_size(&bio_list);
+       if (!nr_data) {
+               /* Every parity was correct, nothing to write back */
+               rbio_orig_end_io(rbio, 0, 0);
+               return;
+       }
+
+       atomic_set(&rbio->stripes_pending, nr_data);
+
+       while (1) {
+               bio = bio_list_pop(&bio_list);
+               if (!bio)
+                       break;
+
+               bio->bi_private = rbio;
+               bio->bi_end_io = raid_write_parity_end_io;
+               BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+               submit_bio(WRITE, bio);
+       }
+       return;
+
+cleanup:
+       rbio_orig_end_io(rbio, -EIO, 0);
+}
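The core of finish_parity_scrub() is "recompute the parity, compare it to what is on disk, and rewrite only on mismatch". Below is a standalone sketch of that check for the RAID5 (single parity, XOR) case, with a hypothetical helper name and plain buffers standing in for the mapped pages.

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Recompute RAID5 parity from the data buffers and compare it with the
 * parity read from disk.  Returns true (and fixes the buffer) when the
 * on-disk parity was wrong, i.e. when a writeback would be needed. */
static bool check_and_repair_parity(unsigned char **data, int nr_data,
                                    unsigned char *parity, size_t len)
{
        unsigned char computed[4096];

        assert(len <= sizeof(computed) && nr_data >= 1);

        /* recompute: parity = data[0] ^ data[1] ^ ... */
        memcpy(computed, data[0], len);
        for (int i = 1; i < nr_data; i++)
                for (size_t b = 0; b < len; b++)
                        computed[b] ^= data[i][b];

        if (memcmp(parity, computed, len) == 0)
                return false;           /* parity is right, skip the writeback */

        memcpy(parity, computed, len);  /* repair; the caller writes it back */
        return true;
}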
+
+static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
+{
+       if (stripe >= 0 && stripe < rbio->nr_data)
+               return 1;
+       return 0;
+}
+
+/*
+ * While we're doing the parity check and repair, we could have errors
+ * in reading pages off the disk.  This checks for errors and if we're
+ * not able to read the page it'll trigger parity reconstruction.  The
+ * parity scrub will be finished after we've reconstructed the failed
+ * stripes
+ */
+static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
+{
+       if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+               goto cleanup;
+
+       if (rbio->faila >= 0 || rbio->failb >= 0) {
+               int dfail = 0, failp = -1;
+
+               if (is_data_stripe(rbio, rbio->faila))
+                       dfail++;
+               else if (is_parity_stripe(rbio->faila))
+                       failp = rbio->faila;
+
+               if (is_data_stripe(rbio, rbio->failb))
+                       dfail++;
+               else if (is_parity_stripe(rbio->failb))
+                       failp = rbio->failb;
+
+               /*
+                * Because we cannot use the parity being scrubbed to repair
+                * the data, our repair capability is reduced by one.
+                * (In the case of RAID5, we cannot repair anything.)
+                */
+               if (dfail > rbio->bbio->max_errors - 1)
+                       goto cleanup;
+
+               /*
+                * If all the data is good and only the parity is bad,
+                * just repair the parity.
+                */
+               if (dfail == 0) {
+                       finish_parity_scrub(rbio, 0);
+                       return;
+               }
+
+               /*
+                * At this point we have one corrupted data stripe and one
+                * corrupted parity on RAID6.  If the corrupted parity is the
+                * one being scrubbed, we can luckily use the other parity to
+                * repair the data; otherwise the data stripe cannot be repaired.
+                */
+               if (failp != rbio->scrubp)
+                       goto cleanup;
+
+               __raid_recover_end_io(rbio);
+       } else {
+               finish_parity_scrub(rbio, 1);
+       }
+       return;
+
+cleanup:
+       rbio_orig_end_io(rbio, -EIO, 0);
+}
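The branching above boils down to a small decision based on how many data stripes failed and which parity failed. A standalone restatement follows; the return codes are made up for illustration, max_errors is 1 for RAID5 and 2 for RAID6, and failed_parity is -1 when no parity stripe failed.

/* Decide how to proceed after the scrub read phase:
 *   -1: unrecoverable, fail the rbio
 *    0: all data is good, just rewrite the parity
 *    1: rebuild the failed data stripe first, then finish the scrub
 * scrubp is the parity stripe currently under scrub. */
static int parity_scrub_decision(int data_failures, int failed_parity,
                                 int scrubp, int max_errors)
{
        /* the parity being scrubbed cannot help with repair */
        if (data_failures > max_errors - 1)
                return -1;
        if (data_failures == 0)
                return 0;
        if (failed_parity != scrubp)
                return -1;
        return 1;
}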
+
+/*
+ * end io for the read phase of the parity scrub.  All the bios here are
+ * physical stripe bios we've read from the disk so we can recalculate the
+ * parity of the stripe.
+ *
+ * This will usually kick off finish_parity_scrub once all the bios are read
+ * in, but it may trigger parity reconstruction if we had any errors along
+ * the way
+ */
+static void raid56_parity_scrub_end_io(struct bio *bio, int err)
+{
+       struct btrfs_raid_bio *rbio = bio->bi_private;
+
+       if (err)
+               fail_bio_stripe(rbio, bio);
+       else
+               set_bio_pages_uptodate(bio);
+
+       bio_put(bio);
+
+       if (!atomic_dec_and_test(&rbio->stripes_pending))
+               return;
+
+       /*
+        * this will normally call finish_parity_scrub to start our
+        * write, but if there are any failed stripes we'll reconstruct
+        * from parity first
+        */
+       validate_rbio_for_parity_scrub(rbio);
+}
+
+static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
+{
+       int bios_to_read = 0;
+       struct bio_list bio_list;
+       int ret;
+       int pagenr;
+       int stripe;
+       struct bio *bio;
+
+       ret = alloc_rbio_essential_pages(rbio);
+       if (ret)
+               goto cleanup;
+
+       bio_list_init(&bio_list);
+
+       atomic_set(&rbio->error, 0);
+       /*
+        * build a list of bios to read all the missing parts of this
+        * stripe
+        */
+       for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+               for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+                       struct page *page;
+                       /*
+                        * we want to find all the pages missing from
+                        * the rbio and read them from the disk.  If
+                        * page_in_rbio finds a page in the bio list
+                        * we don't need to read it off the stripe.
+                        */
+                       page = page_in_rbio(rbio, stripe, pagenr, 1);
+                       if (page)
+                               continue;
+
+                       page = rbio_stripe_page(rbio, stripe, pagenr);
+                       /*
+                        * the bio cache may have handed us an uptodate
+                        * page.  If so, be happy and use it
+                        */
+                       if (PageUptodate(page))
+                               continue;
+
+                       ret = rbio_add_io_page(rbio, &bio_list, page,
+                                      stripe, pagenr, rbio->stripe_len);
+                       if (ret)
+                               goto cleanup;
+               }
+       }
+
+       bios_to_read = bio_list_size(&bio_list);
+       if (!bios_to_read) {
+               /*
+                * this can happen if others have merged with
+                * us; it means there is nothing left to read.
+                * But if there are missing devices it may not be
+                * safe to do the full stripe write yet.
+                */
+               goto finish;
+       }
+
+       /*
+        * the bbio may be freed once we submit the last bio.  Make sure
+        * not to touch it after that
+        */
+       atomic_set(&rbio->stripes_pending, bios_to_read);
+       while (1) {
+               bio = bio_list_pop(&bio_list);
+               if (!bio)
+                       break;
+
+               bio->bi_private = rbio;
+               bio->bi_end_io = raid56_parity_scrub_end_io;
+
+               btrfs_bio_wq_end_io(rbio->fs_info, bio,
+                                   BTRFS_WQ_ENDIO_RAID56);
+
+               BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+               submit_bio(READ, bio);
+       }
+       /* the actual write will happen once the reads are done */
+       return;
+
+cleanup:
+       rbio_orig_end_io(rbio, -EIO, 0);
+       return;
+
+finish:
+       validate_rbio_for_parity_scrub(rbio);
+}
+
+static void scrub_parity_work(struct btrfs_work *work)
+{
+       struct btrfs_raid_bio *rbio;
+
+       rbio = container_of(work, struct btrfs_raid_bio, work);
+       raid56_parity_scrub_stripe(rbio);
+}
+
+static void async_scrub_parity(struct btrfs_raid_bio *rbio)
+{
+       btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+                       scrub_parity_work, NULL, NULL);
+
+       btrfs_queue_work(rbio->fs_info->rmw_workers,
+                        &rbio->work);
+}
+
+void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
+{
+       if (!lock_stripe_add(rbio))
+               async_scrub_parity(rbio);
+}
index ea5d73b..31d4a15 100644 (file)
@@ -39,13 +39,25 @@ static inline int nr_data_stripes(struct map_lookup *map)
 #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) ||                \
                             ((x) == RAID6_Q_STRIPE))
 
+struct btrfs_raid_bio;
+struct btrfs_device;
+
 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
-                                struct btrfs_bio *bbio, u64 *raid_map,
-                                u64 stripe_len, int mirror_num);
+                         struct btrfs_bio *bbio, u64 *raid_map,
+                         u64 stripe_len, int mirror_num, int generic_io);
 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
                               struct btrfs_bio *bbio, u64 *raid_map,
                               u64 stripe_len);
 
+struct btrfs_raid_bio *
+raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
+                              struct btrfs_bio *bbio, u64 *raid_map,
+                              u64 stripe_len, struct btrfs_device *scrub_dev,
+                              unsigned long *dbitmap, int stripe_nsectors);
+void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
+                                  struct page *page, u64 logical);
+void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
+
 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
 #endif
index efa0831..f2bb13a 100644 (file)
@@ -63,10 +63,18 @@ struct scrub_ctx;
  */
 #define SCRUB_MAX_PAGES_PER_BLOCK      16      /* 64k per node/leaf/sector */
 
+struct scrub_recover {
+       atomic_t                refs;
+       struct btrfs_bio        *bbio;
+       u64                     *raid_map;
+       u64                     map_length;
+};
+
 struct scrub_page {
        struct scrub_block      *sblock;
        struct page             *page;
        struct btrfs_device     *dev;
+       struct list_head        list;
        u64                     flags;  /* extent flags */
        u64                     generation;
        u64                     logical;
@@ -79,6 +87,8 @@ struct scrub_page {
                unsigned int    io_error:1;
        };
        u8                      csum[BTRFS_CSUM_SIZE];
+
+       struct scrub_recover    *recover;
 };
 
 struct scrub_bio {
@@ -105,14 +115,52 @@ struct scrub_block {
        atomic_t                outstanding_pages;
        atomic_t                ref_count; /* free mem on transition to zero */
        struct scrub_ctx        *sctx;
+       struct scrub_parity     *sparity;
        struct {
                unsigned int    header_error:1;
                unsigned int    checksum_error:1;
                unsigned int    no_io_error_seen:1;
                unsigned int    generation_error:1; /* also sets header_error */
+
+               /* The following is for the data used to check the parity
+                * (only data protected by a checksum is used) */
+               unsigned int    data_corrected:1;
        };
 };
 
+/* Used for chunks that have a parity stripe, such as RAID5/6 */
+struct scrub_parity {
+       struct scrub_ctx        *sctx;
+
+       struct btrfs_device     *scrub_dev;
+
+       u64                     logic_start;
+
+       u64                     logic_end;
+
+       int                     nsectors;
+
+       int                     stripe_len;
+
+       atomic_t                ref_count;
+
+       struct list_head        spages;
+
+       /* Work of parity check and repair */
+       struct btrfs_work       work;
+
+       /* Mark the parity blocks which have data */
+       unsigned long           *dbitmap;
+
+       /*
+        * Mark the parity blocks which have data but hit errors when the
+        * data was read or checked
+        */
+       unsigned long           *ebitmap;
+
+       unsigned long           bitmap[0];
+};
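The zero-length bitmap[0] array above is split into the two bitmaps at allocation time (see scrub_raid56_parity() further down, which allocates sizeof(struct scrub_parity) + 2 * bitmap_len). A user-space sketch of the same trailing-storage trick with a C99 flexible array member; the names are hypothetical.

#include <stdlib.h>

struct parity_ctx {
        int nsectors;
        unsigned long *dbitmap; /* sectors that carry data */
        unsigned long *ebitmap; /* sectors that hit an error */
        unsigned long bitmap[]; /* both bitmaps live in this trailing area */
};

static struct parity_ctx *parity_ctx_alloc(int nsectors, size_t bitmap_len)
{
        struct parity_ctx *ctx = calloc(1, sizeof(*ctx) + 2 * bitmap_len);

        if (!ctx)
                return NULL;
        ctx->nsectors = nsectors;
        ctx->dbitmap = ctx->bitmap;
        ctx->ebitmap = (unsigned long *)((char *)ctx->bitmap + bitmap_len);
        return ctx;
}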
+
 struct scrub_wr_ctx {
        struct scrub_bio *wr_curr_bio;
        struct btrfs_device *tgtdev;
@@ -196,7 +244,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
                                struct scrub_block *sblock, int is_metadata,
                                int have_csum, u8 *csum, u64 generation,
-                               u16 csum_size);
+                               u16 csum_size, int retry_failed_mirror);
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
                                         struct scrub_block *sblock,
                                         int is_metadata, int have_csum,
@@ -218,6 +266,8 @@ static void scrub_block_get(struct scrub_block *sblock);
 static void scrub_block_put(struct scrub_block *sblock);
 static void scrub_page_get(struct scrub_page *spage);
 static void scrub_page_put(struct scrub_page *spage);
+static void scrub_parity_get(struct scrub_parity *sparity);
+static void scrub_parity_put(struct scrub_parity *sparity);
 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
                                    struct scrub_page *spage);
 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -790,6 +840,20 @@ out:
        scrub_pending_trans_workers_dec(sctx);
 }
 
+static inline void scrub_get_recover(struct scrub_recover *recover)
+{
+       atomic_inc(&recover->refs);
+}
+
+static inline void scrub_put_recover(struct scrub_recover *recover)
+{
+       if (atomic_dec_and_test(&recover->refs)) {
+               kfree(recover->bbio);
+               kfree(recover->raid_map);
+               kfree(recover);
+       }
+}
+
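scrub_get_recover()/scrub_put_recover() are the usual free-on-last-reference pattern around the shared bbio and raid_map. A standalone sketch of that pattern using C11 atomics; the structure and names are made up, with one pointer standing in for the shared mapping data.

#include <stdatomic.h>
#include <stdlib.h>

struct recover_ctx {
        atomic_int refs;
        void *mapping;          /* stands in for bbio + raid_map */
};

static void recover_get(struct recover_ctx *r)
{
        atomic_fetch_add(&r->refs, 1);
}

static void recover_put(struct recover_ctx *r)
{
        /* the last put frees the shared mapping and the context itself */
        if (atomic_fetch_sub(&r->refs, 1) == 1) {
                free(r->mapping);
                free(r);
        }
}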
 /*
  * scrub_handle_errored_block gets called when either verification of the
  * pages failed or the bio failed to read, e.g. with EIO. In the latter
@@ -906,7 +970,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
        /* build and submit the bios for the failed mirror, check checksums */
        scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
-                           csum, generation, sctx->csum_size);
+                           csum, generation, sctx->csum_size, 1);
 
        if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
            sblock_bad->no_io_error_seen) {
@@ -920,6 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                 */
                spin_lock(&sctx->stat_lock);
                sctx->stat.unverified_errors++;
+               sblock_to_check->data_corrected = 1;
                spin_unlock(&sctx->stat_lock);
 
                if (sctx->is_dev_replace)
@@ -1019,7 +1084,7 @@ nodatasum_case:
                /* build and submit the bios, check checksums */
                scrub_recheck_block(fs_info, sblock_other, is_metadata,
                                    have_csum, csum, generation,
-                                   sctx->csum_size);
+                                   sctx->csum_size, 0);
 
                if (!sblock_other->header_error &&
                    !sblock_other->checksum_error &&
@@ -1169,7 +1234,7 @@ nodatasum_case:
                         */
                        scrub_recheck_block(fs_info, sblock_bad,
                                            is_metadata, have_csum, csum,
-                                           generation, sctx->csum_size);
+                                           generation, sctx->csum_size, 1);
                        if (!sblock_bad->header_error &&
                            !sblock_bad->checksum_error &&
                            sblock_bad->no_io_error_seen)
@@ -1180,6 +1245,7 @@ nodatasum_case:
 corrected_error:
                        spin_lock(&sctx->stat_lock);
                        sctx->stat.corrected_errors++;
+                       sblock_to_check->data_corrected = 1;
                        spin_unlock(&sctx->stat_lock);
                        printk_ratelimited_in_rcu(KERN_ERR
                                "BTRFS: fixed up error at logical %llu on dev %s\n",
@@ -1201,11 +1267,18 @@ out:
                     mirror_index++) {
                        struct scrub_block *sblock = sblocks_for_recheck +
                                                     mirror_index;
+                       struct scrub_recover *recover;
                        int page_index;
 
                        for (page_index = 0; page_index < sblock->page_count;
                             page_index++) {
                                sblock->pagev[page_index]->sblock = NULL;
+                               recover = sblock->pagev[page_index]->recover;
+                               if (recover) {
+                                       scrub_put_recover(recover);
+                                       sblock->pagev[page_index]->recover =
+                                                                       NULL;
+                               }
                                scrub_page_put(sblock->pagev[page_index]);
                        }
                }
@@ -1215,14 +1288,63 @@ out:
        return 0;
 }
 
+static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
+{
+       if (raid_map) {
+               if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
+                       return 3;
+               else
+                       return 2;
+       } else {
+               return (int)bbio->num_stripes;
+       }
+}
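scrub_nr_raid_mirrors() counts the reconstruction paths of RAID5/6 as extra "mirrors": two for RAID5 (direct read, rebuild via P) and three for RAID6 (direct read, rebuild via P, rebuild via Q). A minimal restatement, with a stand-in value for the RAID6_Q_STRIPE marker.

#include <stdint.h>

#define Q_STRIPE_MARK UINT64_MAX        /* stands in for RAID6_Q_STRIPE */

static int nr_raid_mirrors(const uint64_t *raid_map, int num_stripes)
{
        if (!raid_map)
                return num_stripes;     /* plain mirrored profiles */
        return raid_map[num_stripes - 1] == Q_STRIPE_MARK ? 3 : 2;
}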
+
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
+                                                u64 mapped_length,
+                                                int nstripes, int mirror,
+                                                int *stripe_index,
+                                                u64 *stripe_offset)
+{
+       int i;
+
+       if (raid_map) {
+               /* RAID5/6 */
+               for (i = 0; i < nstripes; i++) {
+                       if (raid_map[i] == RAID6_Q_STRIPE ||
+                           raid_map[i] == RAID5_P_STRIPE)
+                               continue;
+
+                       if (logical >= raid_map[i] &&
+                           logical < raid_map[i] + mapped_length)
+                               break;
+               }
+
+               *stripe_index = i;
+               *stripe_offset = logical - raid_map[i];
+       } else {
+               /* The other RAID type */
+               *stripe_index = mirror;
+               *stripe_offset = 0;
+       }
+}
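For RAID5/6 the helper above walks raid_map to find which data stripe covers a logical address and the offset inside it. Below is a simplified standalone version of just that branch, with a sentinel for the parity slots; the stripe length and sample addresses are assumptions of the sketch.

#include <stdint.h>
#include <stdio.h>

#define PARITY_SLOT UINT64_MAX  /* stands in for RAID5_P/RAID6_Q markers */

static void stripe_index_and_offset(uint64_t logical, const uint64_t *raid_map,
                                    uint64_t stripe_len, int nstripes,
                                    int *index, uint64_t *offset)
{
        for (int i = 0; i < nstripes; i++) {
                if (raid_map[i] == PARITY_SLOT)
                        continue;
                if (logical >= raid_map[i] &&
                    logical < raid_map[i] + stripe_len) {
                        *index = i;
                        *offset = logical - raid_map[i];
                        return;
                }
        }
        *index = -1;            /* not covered: should not happen for valid input */
        *offset = 0;
}

int main(void)
{
        /* two 64 KiB data stripes followed by one parity slot */
        uint64_t raid_map[] = { 0, 65536, PARITY_SLOT };
        int idx;
        uint64_t off;

        stripe_index_and_offset(70000, raid_map, 65536, 3, &idx, &off);
        printf("stripe %d offset %llu\n", idx, (unsigned long long)off);
        return 0;               /* prints: stripe 1 offset 4464 */
}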
+
 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
                                     struct btrfs_fs_info *fs_info,
                                     struct scrub_block *original_sblock,
                                     u64 length, u64 logical,
                                     struct scrub_block *sblocks_for_recheck)
 {
+       struct scrub_recover *recover;
+       struct btrfs_bio *bbio;
+       u64 *raid_map;
+       u64 sublen;
+       u64 mapped_length;
+       u64 stripe_offset;
+       int stripe_index;
        int page_index;
        int mirror_index;
+       int nmirrors;
        int ret;
 
        /*
@@ -1233,23 +1355,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 
        page_index = 0;
        while (length > 0) {
-               u64 sublen = min_t(u64, length, PAGE_SIZE);
-               u64 mapped_length = sublen;
-               struct btrfs_bio *bbio = NULL;
+               sublen = min_t(u64, length, PAGE_SIZE);
+               mapped_length = sublen;
+               bbio = NULL;
+               raid_map = NULL;
 
                /*
                 * with a length of PAGE_SIZE, each returned stripe
                 * represents one mirror
                 */
-               ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
-                                     &mapped_length, &bbio, 0);
+               ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
+                                      &mapped_length, &bbio, 0, &raid_map);
                if (ret || !bbio || mapped_length < sublen) {
                        kfree(bbio);
+                       kfree(raid_map);
                        return -EIO;
                }
 
+               recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
+               if (!recover) {
+                       kfree(bbio);
+                       kfree(raid_map);
+                       return -ENOMEM;
+               }
+
+               atomic_set(&recover->refs, 1);
+               recover->bbio = bbio;
+               recover->raid_map = raid_map;
+               recover->map_length = mapped_length;
+
                BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
-               for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
+
+               nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
+               for (mirror_index = 0; mirror_index < nmirrors;
                     mirror_index++) {
                        struct scrub_block *sblock;
                        struct scrub_page *page;
@@ -1265,26 +1403,38 @@ leave_nomem:
                                spin_lock(&sctx->stat_lock);
                                sctx->stat.malloc_errors++;
                                spin_unlock(&sctx->stat_lock);
-                               kfree(bbio);
+                               scrub_put_recover(recover);
                                return -ENOMEM;
                        }
                        scrub_page_get(page);
                        sblock->pagev[page_index] = page;
                        page->logical = logical;
-                       page->physical = bbio->stripes[mirror_index].physical;
+
+                       scrub_stripe_index_and_offset(logical, raid_map,
+                                                     mapped_length,
+                                                     bbio->num_stripes,
+                                                     mirror_index,
+                                                     &stripe_index,
+                                                     &stripe_offset);
+                       page->physical = bbio->stripes[stripe_index].physical +
+                                        stripe_offset;
+                       page->dev = bbio->stripes[stripe_index].dev;
+
                        BUG_ON(page_index >= original_sblock->page_count);
                        page->physical_for_dev_replace =
                                original_sblock->pagev[page_index]->
                                physical_for_dev_replace;
                        /* for missing devices, dev->bdev is NULL */
-                       page->dev = bbio->stripes[mirror_index].dev;
                        page->mirror_num = mirror_index + 1;
                        sblock->page_count++;
                        page->page = alloc_page(GFP_NOFS);
                        if (!page->page)
                                goto leave_nomem;
+
+                       scrub_get_recover(recover);
+                       page->recover = recover;
                }
-               kfree(bbio);
+               scrub_put_recover(recover);
                length -= sublen;
                logical += sublen;
                page_index++;
@@ -1293,6 +1443,51 @@ leave_nomem:
        return 0;
 }
 
+struct scrub_bio_ret {
+       struct completion event;
+       int error;
+};
+
+static void scrub_bio_wait_endio(struct bio *bio, int error)
+{
+       struct scrub_bio_ret *ret = bio->bi_private;
+
+       ret->error = error;
+       complete(&ret->event);
+}
+
+static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+{
+       return page->recover && page->recover->raid_map;
+}
+
+static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
+                                       struct bio *bio,
+                                       struct scrub_page *page)
+{
+       struct scrub_bio_ret done;
+       int ret;
+
+       init_completion(&done.event);
+       done.error = 0;
+       bio->bi_iter.bi_sector = page->logical >> 9;
+       bio->bi_private = &done;
+       bio->bi_end_io = scrub_bio_wait_endio;
+
+       ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
+                                   page->recover->raid_map,
+                                   page->recover->map_length,
+                                   page->mirror_num, 0);
+       if (ret)
+               return ret;
+
+       wait_for_completion(&done.event);
+       if (done.error)
+               return -EIO;
+
+       return 0;
+}
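scrub_bio_wait_endio() plus scrub_submit_raid56_bio_wait() turn the asynchronous RAID56 recovery into a synchronous read by blocking on a completion. Below is a user-space analogue of that wait-for-callback pattern using pthreads; it only illustrates the synchronization shape, not the kernel completion API, and every name in it is hypothetical.

#include <pthread.h>

struct wait_ret {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        int done;
        int error;
};

/* callback side: record the result and wake the waiter */
static void io_done(struct wait_ret *w, int error)
{
        pthread_mutex_lock(&w->lock);
        w->error = error;
        w->done = 1;
        pthread_cond_signal(&w->cond);
        pthread_mutex_unlock(&w->lock);
}

/* submit side: block until the callback fires, then return its status */
static int wait_for_io(struct wait_ret *w)
{
        pthread_mutex_lock(&w->lock);
        while (!w->done)
                pthread_cond_wait(&w->cond, &w->lock);
        pthread_mutex_unlock(&w->lock);
        return w->error ? -5 /* -EIO */ : 0;
}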
+
 /*
  * this function will check the on disk data for checksum errors, header
  * errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1303,7 +1498,7 @@ leave_nomem:
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
                                struct scrub_block *sblock, int is_metadata,
                                int have_csum, u8 *csum, u64 generation,
-                               u16 csum_size)
+                               u16 csum_size, int retry_failed_mirror)
 {
        int page_num;
 
@@ -1329,11 +1524,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
                        continue;
                }
                bio->bi_bdev = page->dev->bdev;
-               bio->bi_iter.bi_sector = page->physical >> 9;
 
                bio_add_page(bio, page->page, PAGE_SIZE, 0);
-               if (btrfsic_submit_bio_wait(READ, bio))
-                       sblock->no_io_error_seen = 0;
+               if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
+                       if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
+                               sblock->no_io_error_seen = 0;
+               } else {
+                       bio->bi_iter.bi_sector = page->physical >> 9;
+
+                       if (btrfsic_submit_bio_wait(READ, bio))
+                               sblock->no_io_error_seen = 0;
+               }
 
                bio_put(bio);
        }
@@ -1486,6 +1687,13 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
 {
        int page_num;
 
+       /*
+        * This block is only used to check the parity on the source device,
+        * so its data doesn't need to be written to the destination device.
+        */
+       if (sblock->sparity)
+               return;
+
        for (page_num = 0; page_num < sblock->page_count; page_num++) {
                int ret;
 
@@ -1867,6 +2075,9 @@ static void scrub_block_put(struct scrub_block *sblock)
        if (atomic_dec_and_test(&sblock->ref_count)) {
                int i;
 
+               if (sblock->sparity)
+                       scrub_parity_put(sblock->sparity);
+
                for (i = 0; i < sblock->page_count; i++)
                        scrub_page_put(sblock->pagev[i]);
                kfree(sblock);
@@ -2124,9 +2335,51 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
        scrub_pending_bio_dec(sctx);
 }
 
+static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
+                                      unsigned long *bitmap,
+                                      u64 start, u64 len)
+{
+       int offset;
+       int nsectors;
+       int sectorsize = sparity->sctx->dev_root->sectorsize;
+
+       if (len >= sparity->stripe_len) {
+               bitmap_set(bitmap, 0, sparity->nsectors);
+               return;
+       }
+
+       start -= sparity->logic_start;
+       offset = (int)do_div(start, sparity->stripe_len);
+       offset /= sectorsize;
+       nsectors = (int)len / sectorsize;
+
+       if (offset + nsectors <= sparity->nsectors) {
+               bitmap_set(bitmap, offset, nsectors);
+               return;
+       }
+
+       bitmap_set(bitmap, offset, sparity->nsectors - offset);
+       bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
+}
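__scrub_mark_bitmap() has to cope with a range that starts part-way into the per-stripe bitmap and wraps back to sector 0. A standalone restatement of just that wraparound, assuming the whole bitmap fits into a single unsigned long; the function name and parameters are illustrative only.

/* Mark nsectors_in_range sectors starting at 'offset' in a bitmap of
 * 'nsectors' sectors, wrapping around to sector 0 like __scrub_mark_bitmap().
 * Example: nsectors = 16, offset = 14, nsectors_in_range = 4 sets
 * bits 14, 15, 0 and 1. */
static void mark_sectors(unsigned long *bitmap, int nsectors,
                         int offset, int nsectors_in_range)
{
        if (offset + nsectors_in_range <= nsectors) {
                for (int i = 0; i < nsectors_in_range; i++)
                        bitmap[0] |= 1UL << (offset + i);
                return;
        }

        /* tail of the bitmap ... */
        for (int i = offset; i < nsectors; i++)
                bitmap[0] |= 1UL << i;
        /* ... then wrap around to the front */
        for (int i = 0; i < nsectors_in_range - (nsectors - offset); i++)
                bitmap[0] |= 1UL << i;
}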
+
+static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
+                                                  u64 start, u64 len)
+{
+       __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
+}
+
+static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
+                                                 u64 start, u64 len)
+{
+       __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
+}
+
 static void scrub_block_complete(struct scrub_block *sblock)
 {
+       int corrupted = 0;
+
        if (!sblock->no_io_error_seen) {
+               corrupted = 1;
                scrub_handle_errored_block(sblock);
        } else {
                /*
@@ -2134,9 +2387,19 @@ static void scrub_block_complete(struct scrub_block *sblock)
                 * dev replace case, otherwise write here in dev replace
                 * case.
                 */
-               if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
+               corrupted = scrub_checksum(sblock);
+               if (!corrupted && sblock->sctx->is_dev_replace)
                        scrub_write_block_to_dev_replace(sblock);
        }
+
+       if (sblock->sparity && corrupted && !sblock->data_corrected) {
+               u64 start = sblock->pagev[0]->logical;
+               u64 end = sblock->pagev[sblock->page_count - 1]->logical +
+                         PAGE_SIZE;
+
+               scrub_parity_mark_sectors_error(sblock->sparity,
+                                               start, end - start);
+       }
 }
 
 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -2228,6 +2491,132 @@ behind_scrub_pages:
        return 0;
 }
 
+static int scrub_pages_for_parity(struct scrub_parity *sparity,
+                                 u64 logical, u64 len,
+                                 u64 physical, struct btrfs_device *dev,
+                                 u64 flags, u64 gen, int mirror_num, u8 *csum)
+{
+       struct scrub_ctx *sctx = sparity->sctx;
+       struct scrub_block *sblock;
+       int index;
+
+       sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+       if (!sblock) {
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.malloc_errors++;
+               spin_unlock(&sctx->stat_lock);
+               return -ENOMEM;
+       }
+
+       /* one ref inside this function, plus one for each page added to
+        * a bio later on */
+       atomic_set(&sblock->ref_count, 1);
+       sblock->sctx = sctx;
+       sblock->no_io_error_seen = 1;
+       sblock->sparity = sparity;
+       scrub_parity_get(sparity);
+
+       for (index = 0; len > 0; index++) {
+               struct scrub_page *spage;
+               u64 l = min_t(u64, len, PAGE_SIZE);
+
+               spage = kzalloc(sizeof(*spage), GFP_NOFS);
+               if (!spage) {
+leave_nomem:
+                       spin_lock(&sctx->stat_lock);
+                       sctx->stat.malloc_errors++;
+                       spin_unlock(&sctx->stat_lock);
+                       scrub_block_put(sblock);
+                       return -ENOMEM;
+               }
+               BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+               /* For scrub block */
+               scrub_page_get(spage);
+               sblock->pagev[index] = spage;
+               /* For scrub parity */
+               scrub_page_get(spage);
+               list_add_tail(&spage->list, &sparity->spages);
+               spage->sblock = sblock;
+               spage->dev = dev;
+               spage->flags = flags;
+               spage->generation = gen;
+               spage->logical = logical;
+               spage->physical = physical;
+               spage->mirror_num = mirror_num;
+               if (csum) {
+                       spage->have_csum = 1;
+                       memcpy(spage->csum, csum, sctx->csum_size);
+               } else {
+                       spage->have_csum = 0;
+               }
+               sblock->page_count++;
+               spage->page = alloc_page(GFP_NOFS);
+               if (!spage->page)
+                       goto leave_nomem;
+               len -= l;
+               logical += l;
+               physical += l;
+       }
+
+       WARN_ON(sblock->page_count == 0);
+       for (index = 0; index < sblock->page_count; index++) {
+               struct scrub_page *spage = sblock->pagev[index];
+               int ret;
+
+               ret = scrub_add_page_to_rd_bio(sctx, spage);
+               if (ret) {
+                       scrub_block_put(sblock);
+                       return ret;
+               }
+       }
+
+       /* last one frees, either here or in bio completion for last page */
+       scrub_block_put(sblock);
+       return 0;
+}
+
+static int scrub_extent_for_parity(struct scrub_parity *sparity,
+                                  u64 logical, u64 len,
+                                  u64 physical, struct btrfs_device *dev,
+                                  u64 flags, u64 gen, int mirror_num)
+{
+       struct scrub_ctx *sctx = sparity->sctx;
+       int ret;
+       u8 csum[BTRFS_CSUM_SIZE];
+       u32 blocksize;
+
+       if (flags & BTRFS_EXTENT_FLAG_DATA) {
+               blocksize = sctx->sectorsize;
+       } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+               blocksize = sctx->nodesize;
+       } else {
+               blocksize = sctx->sectorsize;
+               WARN_ON(1);
+       }
+
+       while (len) {
+               u64 l = min_t(u64, len, blocksize);
+               int have_csum = 0;
+
+               if (flags & BTRFS_EXTENT_FLAG_DATA) {
+                       /* push csums to sbio */
+                       have_csum = scrub_find_csum(sctx, logical, l, csum);
+                       if (have_csum == 0)
+                               goto skip;
+               }
+               ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
+                                            flags, gen, mirror_num,
+                                            have_csum ? csum : NULL);
+skip:
+               if (ret)
+                       return ret;
+               len -= l;
+               logical += l;
+               physical += l;
+       }
+       return 0;
+}
+
 /*
  * Given a physical address, this will calculate it's
  * logical offset. if this is a parity stripe, it will return
@@ -2236,7 +2625,8 @@ behind_scrub_pages:
  * return 0 if it is a data stripe, 1 means parity stripe.
  */
 static int get_raid56_logic_offset(u64 physical, int num,
-                                  struct map_lookup *map, u64 *offset)
+                                  struct map_lookup *map, u64 *offset,
+                                  u64 *stripe_start)
 {
        int i;
        int j = 0;
@@ -2247,6 +2637,9 @@ static int get_raid56_logic_offset(u64 physical, int num,
 
        last_offset = (physical - map->stripes[num].physical) *
                      nr_data_stripes(map);
+       if (stripe_start)
+               *stripe_start = last_offset;
+
        *offset = last_offset;
        for (i = 0; i < nr_data_stripes(map); i++) {
                *offset = last_offset + i * map->stripe_len;
@@ -2269,13 +2662,330 @@ static int get_raid56_logic_offset(u64 physical, int num,
        return 1;
 }
 
+static void scrub_free_parity(struct scrub_parity *sparity)
+{
+       struct scrub_ctx *sctx = sparity->sctx;
+       struct scrub_page *curr, *next;
+       int nbits;
+
+       nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
+       if (nbits) {
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.read_errors += nbits;
+               sctx->stat.uncorrectable_errors += nbits;
+               spin_unlock(&sctx->stat_lock);
+       }
+
+       list_for_each_entry_safe(curr, next, &sparity->spages, list) {
+               list_del_init(&curr->list);
+               scrub_page_put(curr);
+       }
+
+       kfree(sparity);
+}
+
+static void scrub_parity_bio_endio(struct bio *bio, int error)
+{
+       struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
+       struct scrub_ctx *sctx = sparity->sctx;
+
+       if (error)
+               bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+                         sparity->nsectors);
+
+       scrub_free_parity(sparity);
+       scrub_pending_bio_dec(sctx);
+       bio_put(bio);
+}
+
+static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
+{
+       struct scrub_ctx *sctx = sparity->sctx;
+       struct bio *bio;
+       struct btrfs_raid_bio *rbio;
+       struct scrub_page *spage;
+       struct btrfs_bio *bbio = NULL;
+       u64 *raid_map = NULL;
+       u64 length;
+       int ret;
+
+       if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
+                          sparity->nsectors))
+               goto out;
+
+       length = sparity->logic_end - sparity->logic_start + 1;
+       ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
+                              sparity->logic_start,
+                              &length, &bbio, 0, &raid_map);
+       if (ret || !bbio || !raid_map)
+               goto bbio_out;
+
+       bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+       if (!bio)
+               goto bbio_out;
+
+       bio->bi_iter.bi_sector = sparity->logic_start >> 9;
+       bio->bi_private = sparity;
+       bio->bi_end_io = scrub_parity_bio_endio;
+
+       rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
+                                             raid_map, length,
+                                             sparity->scrub_dev,
+                                             sparity->dbitmap,
+                                             sparity->nsectors);
+       if (!rbio)
+               goto rbio_out;
+
+       list_for_each_entry(spage, &sparity->spages, list)
+               raid56_parity_add_scrub_pages(rbio, spage->page,
+                                             spage->logical);
+
+       scrub_pending_bio_inc(sctx);
+       raid56_parity_submit_scrub_rbio(rbio);
+       return;
+
+rbio_out:
+       bio_put(bio);
+bbio_out:
+       kfree(bbio);
+       kfree(raid_map);
+       bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+                 sparity->nsectors);
+       spin_lock(&sctx->stat_lock);
+       sctx->stat.malloc_errors++;
+       spin_unlock(&sctx->stat_lock);
+out:
+       scrub_free_parity(sparity);
+}
+
+static inline int scrub_calc_parity_bitmap_len(int nsectors)
+{
+       return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
+}
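
The helper above rounds the per-stripe sector count up to whole unsigned longs and returns that size in bytes, so the data bitmap (dbitmap) and the error bitmap (ebitmap) set up later in scrub_raid56_parity() can live in one allocation. A minimal userspace sketch of the sizing and layout, with made-up stripe and sector sizes:

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

#define BITS_PER_LONG   (sizeof(unsigned long) * CHAR_BIT)
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

static int parity_bitmap_len(int nsectors)
{
        /* whole unsigned longs, expressed in bytes */
        return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
}

int main(void)
{
        int nsectors = 64 * 1024 / 4096;        /* 64K stripe, 4K sectors = 16 */
        int len = parity_bitmap_len(nsectors);  /* 8 bytes on a 64-bit machine */
        unsigned char *buf = calloc(2, len);    /* one buffer, two bitmaps */
        unsigned char *dbitmap = buf;           /* sectors that carry data */
        unsigned char *ebitmap = buf + len;     /* sectors that hit errors */

        printf("nsectors=%d bitmap_len=%d dbitmap=%p ebitmap=%p\n",
               nsectors, len, (void *)dbitmap, (void *)ebitmap);
        free(buf);
        return 0;
}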
+
+static void scrub_parity_get(struct scrub_parity *sparity)
+{
+       atomic_inc(&sparity->ref_count);
+}
+
+static void scrub_parity_put(struct scrub_parity *sparity)
+{
+       if (!atomic_dec_and_test(&sparity->ref_count))
+               return;
+
+       scrub_parity_check_and_repair(sparity);
+}
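
scrub_parity_get()/scrub_parity_put() are a plain reference count where whoever drops the last reference runs the check-and-repair step. A userspace sketch of the pattern, with C11 atomics standing in for the kernel's atomic_t and a placeholder "repair" body:

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct parity_work {
        atomic_int ref_count;
        const char *name;
};

static void check_and_repair(struct parity_work *w)
{
        printf("last reference to %s dropped, submitting repair\n", w->name);
        free(w);
}

static void parity_get(struct parity_work *w)
{
        atomic_fetch_add(&w->ref_count, 1);
}

static void parity_put(struct parity_work *w)
{
        /* only the caller that takes the count to zero does the final work */
        if (atomic_fetch_sub(&w->ref_count, 1) == 1)
                check_and_repair(w);
}

int main(void)
{
        struct parity_work *w = malloc(sizeof(*w));

        atomic_init(&w->ref_count, 1);          /* the creator holds one ref */
        w->name = "stripe 0";
        parity_get(w);                          /* e.g. an in-flight sub-bio */
        parity_put(w);                          /* the sub-bio completes */
        parity_put(w);                          /* the creator drops its ref */
        return 0;
}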
+
+static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
+                                                 struct map_lookup *map,
+                                                 struct btrfs_device *sdev,
+                                                 struct btrfs_path *path,
+                                                 u64 logic_start,
+                                                 u64 logic_end)
+{
+       struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+       struct btrfs_root *root = fs_info->extent_root;
+       struct btrfs_root *csum_root = fs_info->csum_root;
+       struct btrfs_extent_item *extent;
+       u64 flags;
+       int ret;
+       int slot;
+       struct extent_buffer *l;
+       struct btrfs_key key;
+       u64 generation;
+       u64 extent_logical;
+       u64 extent_physical;
+       u64 extent_len;
+       struct btrfs_device *extent_dev;
+       struct scrub_parity *sparity;
+       int nsectors;
+       int bitmap_len;
+       int extent_mirror_num;
+       int stop_loop = 0;
+
+       nsectors = map->stripe_len / root->sectorsize;
+       bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
+       sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
+                         GFP_NOFS);
+       if (!sparity) {
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.malloc_errors++;
+               spin_unlock(&sctx->stat_lock);
+               return -ENOMEM;
+       }
+
+       sparity->stripe_len = map->stripe_len;
+       sparity->nsectors = nsectors;
+       sparity->sctx = sctx;
+       sparity->scrub_dev = sdev;
+       sparity->logic_start = logic_start;
+       sparity->logic_end = logic_end;
+       atomic_set(&sparity->ref_count, 1);
+       INIT_LIST_HEAD(&sparity->spages);
+       sparity->dbitmap = sparity->bitmap;
+       sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
+
+       ret = 0;
+       while (logic_start < logic_end) {
+               if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+                       key.type = BTRFS_METADATA_ITEM_KEY;
+               else
+                       key.type = BTRFS_EXTENT_ITEM_KEY;
+               key.objectid = logic_start;
+               key.offset = (u64)-1;
+
+               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+               if (ret < 0)
+                       goto out;
+
+               if (ret > 0) {
+                       ret = btrfs_previous_extent_item(root, path, 0);
+                       if (ret < 0)
+                               goto out;
+                       if (ret > 0) {
+                               btrfs_release_path(path);
+                               ret = btrfs_search_slot(NULL, root, &key,
+                                                       path, 0, 0);
+                               if (ret < 0)
+                                       goto out;
+                       }
+               }
+
+               stop_loop = 0;
+               while (1) {
+                       u64 bytes;
+
+                       l = path->nodes[0];
+                       slot = path->slots[0];
+                       if (slot >= btrfs_header_nritems(l)) {
+                               ret = btrfs_next_leaf(root, path);
+                               if (ret == 0)
+                                       continue;
+                               if (ret < 0)
+                                       goto out;
+
+                               stop_loop = 1;
+                               break;
+                       }
+                       btrfs_item_key_to_cpu(l, &key, slot);
+
+                       if (key.type == BTRFS_METADATA_ITEM_KEY)
+                               bytes = root->nodesize;
+                       else
+                               bytes = key.offset;
+
+                       if (key.objectid + bytes <= logic_start)
+                               goto next;
+
+                       if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+                           key.type != BTRFS_METADATA_ITEM_KEY)
+                               goto next;
+
+                       if (key.objectid > logic_end) {
+                               stop_loop = 1;
+                               break;
+                       }
+
+                       while (key.objectid >= logic_start + map->stripe_len)
+                               logic_start += map->stripe_len;
+
+                       extent = btrfs_item_ptr(l, slot,
+                                               struct btrfs_extent_item);
+                       flags = btrfs_extent_flags(l, extent);
+                       generation = btrfs_extent_generation(l, extent);
+
+                       if (key.objectid < logic_start &&
+                           (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+                               btrfs_err(fs_info,
+                                         "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+                                          key.objectid, logic_start);
+                               goto next;
+                       }
+again:
+                       extent_logical = key.objectid;
+                       extent_len = bytes;
+
+                       if (extent_logical < logic_start) {
+                               extent_len -= logic_start - extent_logical;
+                               extent_logical = logic_start;
+                       }
+
+                       if (extent_logical + extent_len >
+                           logic_start + map->stripe_len)
+                               extent_len = logic_start + map->stripe_len -
+                                            extent_logical;
+
+                       scrub_parity_mark_sectors_data(sparity, extent_logical,
+                                                      extent_len);
+
+                       scrub_remap_extent(fs_info, extent_logical,
+                                          extent_len, &extent_physical,
+                                          &extent_dev,
+                                          &extent_mirror_num);
+
+                       ret = btrfs_lookup_csums_range(csum_root,
+                                               extent_logical,
+                                               extent_logical + extent_len - 1,
+                                               &sctx->csum_list, 1);
+                       if (ret)
+                               goto out;
+
+                       ret = scrub_extent_for_parity(sparity, extent_logical,
+                                                     extent_len,
+                                                     extent_physical,
+                                                     extent_dev, flags,
+                                                     generation,
+                                                     extent_mirror_num);
+                       if (ret)
+                               goto out;
+
+                       scrub_free_csums(sctx);
+                       if (extent_logical + extent_len <
+                           key.objectid + bytes) {
+                               logic_start += map->stripe_len;
+
+                               if (logic_start >= logic_end) {
+                                       stop_loop = 1;
+                                       break;
+                               }
+
+                               if (logic_start < key.objectid + bytes) {
+                                       cond_resched();
+                                       goto again;
+                               }
+                       }
+next:
+                       path->slots[0]++;
+               }
+
+               btrfs_release_path(path);
+
+               if (stop_loop)
+                       break;
+
+               logic_start += map->stripe_len;
+       }
+out:
+       if (ret < 0)
+               scrub_parity_mark_sectors_error(sparity, logic_start,
+                                               logic_end - logic_start + 1);
+       scrub_parity_put(sparity);
+       scrub_submit(sctx);
+       mutex_lock(&sctx->wr_ctx.wr_lock);
+       scrub_wr_submit(sctx);
+       mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+       btrfs_release_path(path);
+       return ret < 0 ? ret : 0;
+}
+
 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
                                           struct map_lookup *map,
                                           struct btrfs_device *scrub_dev,
                                           int num, u64 base, u64 length,
                                           int is_dev_replace)
 {
-       struct btrfs_path *path;
+       struct btrfs_path *path, *ppath;
        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_root *csum_root = fs_info->csum_root;
@@ -2302,6 +3012,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
        u64 extent_logical;
        u64 extent_physical;
        u64 extent_len;
+       u64 stripe_logical;
+       u64 stripe_end;
        struct btrfs_device *extent_dev;
        int extent_mirror_num;
        int stop_loop = 0;
@@ -2327,7 +3039,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
                mirror_num = num % map->num_stripes + 1;
        } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
                                BTRFS_BLOCK_GROUP_RAID6)) {
-               get_raid56_logic_offset(physical, num, map, &offset);
+               get_raid56_logic_offset(physical, num, map, &offset, NULL);
                increment = map->stripe_len * nr_data_stripes(map);
                mirror_num = 1;
        } else {
@@ -2339,6 +3051,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
        if (!path)
                return -ENOMEM;
 
+       ppath = btrfs_alloc_path();
+       if (!ppath) {
+               btrfs_free_path(path);
+               return -ENOMEM;
+       }
+
        /*
         * work on commit root. The related disk blocks are static as
         * long as COW is applied. This means it is safe to rewrite
@@ -2357,7 +3075,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
        if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
                         BTRFS_BLOCK_GROUP_RAID6)) {
                get_raid56_logic_offset(physical_end, num,
-                                       map, &logic_end);
+                                       map, &logic_end, NULL);
                logic_end += base;
        } else {
                logic_end = logical + increment * nstripes;
@@ -2404,10 +3122,18 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
                if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
                                BTRFS_BLOCK_GROUP_RAID6)) {
                        ret = get_raid56_logic_offset(physical, num,
-                                       map, &logical);
+                                       map, &logical, &stripe_logical);
                        logical += base;
-                       if (ret)
+                       if (ret) {
+                               stripe_logical += base;
+                               stripe_end = stripe_logical + increment - 1;
+                               ret = scrub_raid56_parity(sctx, map, scrub_dev,
+                                               ppath, stripe_logical,
+                                               stripe_end);
+                               if (ret)
+                                       goto out;
                                goto skip;
+                       }
                }
                /*
                 * canceled?
@@ -2558,13 +3284,25 @@ again:
                                         * loop until we find next data stripe
                                         * or we have finished all stripes.
                                         */
-                                       do {
-                                               physical += map->stripe_len;
-                                               ret = get_raid56_logic_offset(
-                                                               physical, num,
-                                                               map, &logical);
-                                               logical += base;
-                                       } while (physical < physical_end && ret);
+loop:
+                                       physical += map->stripe_len;
+                                       ret = get_raid56_logic_offset(physical,
+                                                       num, map, &logical,
+                                                       &stripe_logical);
+                                       logical += base;
+
+                                       if (ret && physical < physical_end) {
+                                               stripe_logical += base;
+                                               stripe_end = stripe_logical +
+                                                               increment - 1;
+                                               ret = scrub_raid56_parity(sctx,
+                                                       map, scrub_dev, ppath,
+                                                       stripe_logical,
+                                                       stripe_end);
+                                               if (ret)
+                                                       goto out;
+                                               goto loop;
+                                       }
                                } else {
                                        physical += map->stripe_len;
                                        logical += increment;
@@ -2605,6 +3343,7 @@ out:
 
        blk_finish_plug(&plug);
        btrfs_free_path(path);
+       btrfs_free_path(ppath);
        return ret < 0 ? ret : 0;
 }
 
@@ -3310,6 +4049,50 @@ out:
        scrub_pending_trans_workers_dec(sctx);
 }
 
+static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
+                                u64 logical)
+{
+       struct extent_state *cached_state = NULL;
+       struct btrfs_ordered_extent *ordered;
+       struct extent_io_tree *io_tree;
+       struct extent_map *em;
+       u64 lockstart = start, lockend = start + len - 1;
+       int ret = 0;
+
+       io_tree = &BTRFS_I(inode)->io_tree;
+
+       lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+       ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+       if (ordered) {
+               btrfs_put_ordered_extent(ordered);
+               ret = 1;
+               goto out_unlock;
+       }
+
+       em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+       if (IS_ERR(em)) {
+               ret = PTR_ERR(em);
+               goto out_unlock;
+       }
+
+       /*
+        * This extent does not actually cover the logical extent anymore,
+        * move on to the next inode.
+        */
+       if (em->block_start > logical ||
+           em->block_start + em->block_len < logical + len) {
+               free_extent_map(em);
+               ret = 1;
+               goto out_unlock;
+       }
+       free_extent_map(em);
+
+out_unlock:
+       unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+                            GFP_NOFS);
+       return ret;
+}
+
 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
                                      struct scrub_copy_nocow_ctx *nocow_ctx)
 {
@@ -3318,13 +4101,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
        struct inode *inode;
        struct page *page;
        struct btrfs_root *local_root;
-       struct btrfs_ordered_extent *ordered;
-       struct extent_map *em;
-       struct extent_state *cached_state = NULL;
        struct extent_io_tree *io_tree;
        u64 physical_for_dev_replace;
+       u64 nocow_ctx_logical;
        u64 len = nocow_ctx->len;
-       u64 lockstart = offset, lockend = offset + len - 1;
        unsigned long index;
        int srcu_index;
        int ret = 0;
@@ -3356,30 +4136,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 
        physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
        io_tree = &BTRFS_I(inode)->io_tree;
+       nocow_ctx_logical = nocow_ctx->logical;
 
-       lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
-       ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
-       if (ordered) {
-               btrfs_put_ordered_extent(ordered);
-               goto out_unlock;
-       }
-
-       em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
-       if (IS_ERR(em)) {
-               ret = PTR_ERR(em);
-               goto out_unlock;
-       }
-
-       /*
-        * This extent does not actually cover the logical extent anymore,
-        * move on to the next inode.
-        */
-       if (em->block_start > nocow_ctx->logical ||
-           em->block_start + em->block_len < nocow_ctx->logical + len) {
-               free_extent_map(em);
-               goto out_unlock;
+       ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
+       if (ret) {
+               ret = ret > 0 ? 0 : ret;
+               goto out;
        }
-       free_extent_map(em);
 
        while (len >= PAGE_CACHE_SIZE) {
                index = offset >> PAGE_CACHE_SHIFT;
@@ -3396,7 +4159,7 @@ again:
                                goto next_page;
                } else {
                        ClearPageError(page);
-                       err = extent_read_full_page_nolock(io_tree, page,
+                       err = extent_read_full_page(io_tree, page,
                                                           btrfs_get_extent,
                                                           nocow_ctx->mirror_num);
                        if (err) {
@@ -3421,6 +4184,14 @@ again:
                                goto next_page;
                        }
                }
+
+               ret = check_extent_to_block(inode, offset, len,
+                                           nocow_ctx_logical);
+               if (ret) {
+                       ret = ret > 0 ? 0 : ret;
+                       goto next_page;
+               }
+
                err = write_page_nocow(nocow_ctx->sctx,
                                       physical_for_dev_replace, page);
                if (err)
@@ -3434,12 +4205,10 @@ next_page:
 
                offset += PAGE_CACHE_SIZE;
                physical_for_dev_replace += PAGE_CACHE_SIZE;
+               nocow_ctx_logical += PAGE_CACHE_SIZE;
                len -= PAGE_CACHE_SIZE;
        }
        ret = COPY_COMPLETE;
-out_unlock:
-       unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
-                            GFP_NOFS);
 out:
        mutex_unlock(&inode->i_mutex);
        iput(inode);
index 874828d..804432d 100644 (file)
@@ -5507,6 +5507,51 @@ out:
        return ret;
 }
 
+/*
+ * If orphan cleanup did remove any orphans from a root, it means the tree
+ * was modified and therefore the commit root is not the same as the current
+ * root anymore. This is a problem, because send uses the commit root and
+ * therefore can see inode items that don't exist in the current root anymore,
+ * and for example make calls to btrfs_iget, which will do tree lookups based
+ * on the current root and not on the commit root. Those lookups will fail,
+ * returning a -ESTALE error, and making send fail with that error. So make
+ * sure a send does not see any orphans we have just removed, and that it will
+ * see the same inodes regardless of whether a transaction commit happened
+ * before it started (meaning that the commit root will be the same as the
+ * current root) or not.
+ */
+static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
+{
+       int i;
+       struct btrfs_trans_handle *trans = NULL;
+
+again:
+       if (sctx->parent_root &&
+           sctx->parent_root->node != sctx->parent_root->commit_root)
+               goto commit_trans;
+
+       for (i = 0; i < sctx->clone_roots_cnt; i++)
+               if (sctx->clone_roots[i].root->node !=
+                   sctx->clone_roots[i].root->commit_root)
+                       goto commit_trans;
+
+       if (trans)
+               return btrfs_end_transaction(trans, sctx->send_root);
+
+       return 0;
+
+commit_trans:
+       /* Use any root, all fs roots will get their commit roots updated. */
+       if (!trans) {
+               trans = btrfs_join_transaction(sctx->send_root);
+               if (IS_ERR(trans))
+                       return PTR_ERR(trans);
+               goto again;
+       }
+
+       return btrfs_commit_transaction(trans, sctx->send_root);
+}
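
A userspace model of the logic above, with fake roots and a fake commit standing in for btrfs transactions: compare each root's current node with its commit root, and if anything diverged, re-check under a joined transaction and commit once so every commit root catches up. The structs and helpers are stand-ins, not btrfs APIs.

#include <stdio.h>
#include <stdbool.h>

struct fake_root {
        int node;               /* "current root node" */
        int commit_root;        /* node the last commit captured */
};

static void fake_commit(struct fake_root *roots, int n)
{
        for (int i = 0; i < n; i++)
                roots[i].commit_root = roots[i].node;   /* switch commit roots */
}

static int ensure_commit_roots_uptodate(struct fake_root *roots, int n)
{
        bool joined = false;

again:
        for (int i = 0; i < n; i++) {
                if (roots[i].node != roots[i].commit_root) {
                        if (!joined) {
                                joined = true;  /* join a transaction first */
                                goto again;     /* then re-check under it */
                        }
                        fake_commit(roots, n);  /* one commit updates them all */
                        return 0;
                }
        }
        /* nothing stale (anymore); the kernel would just end the transaction */
        return 0;
}

int main(void)
{
        struct fake_root roots[2] = { { .node = 7, .commit_root = 3 },
                                      { .node = 4, .commit_root = 4 } };

        ensure_commit_roots_uptodate(roots, 2);
        printf("commit roots now: %d %d\n",
               roots[0].commit_root, roots[1].commit_root);
        return 0;
}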
+
 static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
 {
        spin_lock(&root->root_item_lock);
@@ -5728,6 +5773,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
                        NULL);
        sort_clone_roots = 1;
 
+       ret = ensure_commit_roots_uptodate(sctx);
+       if (ret)
+               goto out;
+
        current->journal_info = BTRFS_SEND_TRANS_STUB;
        ret = send_subvol(sctx);
        current->journal_info = NULL;
index 54bd91e..60f7cbe 100644 (file)
@@ -262,7 +262,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
        trans->aborted = errno;
        /* Nothing used. The other threads that have joined this
         * transaction may be able to continue. */
-       if (!trans->blocks_used) {
+       if (!trans->blocks_used && list_empty(&trans->new_bgs)) {
                const char *errstr;
 
                errstr = btrfs_decode_error(errno);
@@ -642,11 +642,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                                             "disabling disk space caching");
                        break;
                case Opt_inode_cache:
-                       btrfs_set_and_info(root, CHANGE_INODE_CACHE,
+                       btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
                                           "enabling inode map caching");
                        break;
                case Opt_noinode_cache:
-                       btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
+                       btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
                                             "disabling inode map caching");
                        break;
                case Opt_clear_cache:
@@ -993,9 +993,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
        trans = btrfs_attach_transaction_barrier(root);
        if (IS_ERR(trans)) {
                /* no transaction, don't bother */
-               if (PTR_ERR(trans) == -ENOENT)
-                       return 0;
-               return PTR_ERR(trans);
+               if (PTR_ERR(trans) == -ENOENT) {
+                       /*
+                        * Exit unless we have some pending changes
+                        * that need to go through commit
+                        */
+                       if (fs_info->pending_changes == 0)
+                               return 0;
+                       trans = btrfs_start_transaction(root, 0);
+               } else {
+                       return PTR_ERR(trans);
+               }
        }
        return btrfs_commit_transaction(trans, root);
 }
@@ -1644,8 +1652,20 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
        int i = 0, nr_devices;
        int ret;
 
+       /*
+        * We aren't under the device list lock, so this is racy-ish, but good
+        * enough for our purposes.
+        */
        nr_devices = fs_info->fs_devices->open_devices;
-       BUG_ON(!nr_devices);
+       if (!nr_devices) {
+               smp_mb();
+               nr_devices = fs_info->fs_devices->open_devices;
+               ASSERT(nr_devices);
+               if (!nr_devices) {
+                       *free_bytes = 0;
+                       return 0;
+               }
+       }
 
        devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
                               GFP_NOFS);
@@ -1670,11 +1690,17 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
        else
                min_stripe_size = BTRFS_STRIPE_LEN;
 
-       list_for_each_entry(device, &fs_devices->devices, dev_list) {
+       if (fs_info->alloc_start)
+               mutex_lock(&fs_devices->device_list_mutex);
+       rcu_read_lock();
+       list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
                if (!device->in_fs_metadata || !device->bdev ||
                    device->is_tgtdev_for_dev_replace)
                        continue;
 
+               if (i >= nr_devices)
+                       break;
+
                avail_space = device->total_bytes - device->bytes_used;
 
                /* align with stripe_len */
@@ -1689,24 +1715,32 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
                skip_space = 1024 * 1024;
 
                /* user can set the offset in fs_info->alloc_start. */
-               if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
-                   device->total_bytes)
+               if (fs_info->alloc_start &&
+                   fs_info->alloc_start + BTRFS_STRIPE_LEN <=
+                   device->total_bytes) {
+                       rcu_read_unlock();
                        skip_space = max(fs_info->alloc_start, skip_space);
 
-               /*
-                * btrfs can not use the free space in [0, skip_space - 1],
-                * we must subtract it from the total. In order to implement
-                * it, we account the used space in this range first.
-                */
-               ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
-                                                    &used_space);
-               if (ret) {
-                       kfree(devices_info);
-                       return ret;
-               }
+                       /*
+                        * btrfs can not use the free space in
+                        * [0, skip_space - 1], we must subtract it from the
+                        * total. In order to implement it, we account the used
+                        * space in this range first.
+                        */
+                       ret = btrfs_account_dev_extents_size(device, 0,
+                                                            skip_space - 1,
+                                                            &used_space);
+                       if (ret) {
+                               kfree(devices_info);
+                               mutex_unlock(&fs_devices->device_list_mutex);
+                               return ret;
+                       }
 
-               /* calc the free space in [0, skip_space - 1] */
-               skip_space -= used_space;
+                       rcu_read_lock();
+
+                       /* calc the free space in [0, skip_space - 1] */
+                       skip_space -= used_space;
+               }
 
                /*
                 * we can use the free space in [0, skip_space - 1], subtract
@@ -1725,6 +1759,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 
                i++;
        }
+       rcu_read_unlock();
+       if (fs_info->alloc_start)
+               mutex_unlock(&fs_devices->device_list_mutex);
 
        nr_devices = i;
 
@@ -1787,8 +1824,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
         * holding chunk_mutex to avoid allocating new chunks, holding
         * device_list_mutex to avoid the device being removed
         */
-       mutex_lock(&fs_info->fs_devices->device_list_mutex);
-       mutex_lock(&fs_info->chunk_mutex);
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1824,17 +1859,12 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_bfree -= block_rsv->size >> bits;
        spin_unlock(&block_rsv->lock);
 
-       buf->f_bavail = total_free_data;
+       buf->f_bavail = div_u64(total_free_data, factor);
        ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
-       if (ret) {
-               mutex_unlock(&fs_info->chunk_mutex);
-               mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+       if (ret)
                return ret;
-       }
        buf->f_bavail += div_u64(total_free_data, factor);
        buf->f_bavail = buf->f_bavail >> bits;
-       mutex_unlock(&fs_info->chunk_mutex);
-       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
        buf->f_type = BTRFS_SUPER_MAGIC;
        buf->f_bsize = dentry->d_sb->s_blocksize;
index b2e7bb4..92db3f6 100644 (file)
@@ -111,7 +111,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info;
        struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
-       struct btrfs_trans_handle *trans;
        u64 features, set, clear;
        unsigned long val;
        int ret;
@@ -153,10 +152,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
        btrfs_info(fs_info, "%s %s feature flag",
                   val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
 
-       trans = btrfs_start_transaction(fs_info->fs_root, 0);
-       if (IS_ERR(trans))
-               return PTR_ERR(trans);
-
        spin_lock(&fs_info->super_lock);
        features = get_features(fs_info, fa->feature_set);
        if (val)
@@ -166,9 +161,11 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
        set_features(fs_info, fa->feature_set, features);
        spin_unlock(&fs_info->super_lock);
 
-       ret = btrfs_commit_transaction(trans, fs_info->fs_root);
-       if (ret)
-               return ret;
+       /*
+        * We don't want to do full transaction commit from inside sysfs
+        */
+       btrfs_set_pending(fs_info, COMMIT);
+       wake_up_process(fs_info->transaction_kthread);
 
        return count;
 }
@@ -372,9 +369,6 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
                                 const char *buf, size_t len)
 {
        struct btrfs_fs_info *fs_info = to_fs_info(kobj);
-       struct btrfs_trans_handle *trans;
-       struct btrfs_root *root = fs_info->fs_root;
-       int ret;
        size_t p_len;
 
        if (fs_info->sb->s_flags & MS_RDONLY)
@@ -389,20 +383,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
        if (p_len >= BTRFS_LABEL_SIZE)
                return -EINVAL;
 
-       trans = btrfs_start_transaction(root, 0);
-       if (IS_ERR(trans))
-               return PTR_ERR(trans);
-
-       spin_lock(&root->fs_info->super_lock);
+       spin_lock(&fs_info->super_lock);
        memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
        memcpy(fs_info->super_copy->label, buf, p_len);
-       spin_unlock(&root->fs_info->super_lock);
-       ret = btrfs_commit_transaction(trans, root);
+       spin_unlock(&fs_info->super_lock);
 
-       if (!ret)
-               return len;
+       /*
+        * We don't want to do full transaction commit from inside sysfs
+        */
+       btrfs_set_pending(fs_info, COMMIT);
+       wake_up_process(fs_info->transaction_kthread);
 
-       return ret;
+       return len;
 }
 BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
 
index dcaae36..a605d4e 100644 (file)
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
        }
 }
 
+static void clear_btree_io_tree(struct extent_io_tree *tree)
+{
+       spin_lock(&tree->lock);
+       while (!RB_EMPTY_ROOT(&tree->state)) {
+               struct rb_node *node;
+               struct extent_state *state;
+
+               node = rb_first(&tree->state);
+               state = rb_entry(node, struct extent_state, rb_node);
+               rb_erase(&state->rb_node, &tree->state);
+               RB_CLEAR_NODE(&state->rb_node);
+               /*
+                * btree io trees aren't supposed to have tasks waiting for
+                * changes in the flags of extent states ever.
+                */
+               ASSERT(!waitqueue_active(&state->wq));
+               free_extent_state(state);
+               if (need_resched()) {
+                       spin_unlock(&tree->lock);
+                       cond_resched();
+                       spin_lock(&tree->lock);
+               }
+       }
+       spin_unlock(&tree->lock);
+}
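
clear_btree_io_tree() drains the whole tree under its spinlock, but drops the lock and reschedules periodically so a large tree cannot hog the CPU. A userspace sketch of that drain-with-yield pattern; the singly linked list, the mutex and the yield threshold are stand-ins for the kernel's rbtree, spinlock and cond_resched():

#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

struct state_node { struct state_node *next; };

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static struct state_node *tree_head;

static void clear_io_tree(void)
{
        int done = 0;

        pthread_mutex_lock(&tree_lock);
        while (tree_head) {
                struct state_node *node = tree_head;

                tree_head = node->next;
                free(node);

                if (++done % 64 == 0) {         /* "need_resched()" stand-in */
                        pthread_mutex_unlock(&tree_lock);
                        sched_yield();          /* cond_resched() stand-in */
                        pthread_mutex_lock(&tree_lock);
                }
        }
        pthread_mutex_unlock(&tree_lock);
}

int main(void)
{
        for (int i = 0; i < 1000; i++) {        /* build a dummy "tree" */
                struct state_node *n = malloc(sizeof(*n));

                n->next = tree_head;
                tree_head = n;
        }
        clear_io_tree();
        printf("drained: %s\n", tree_head ? "no" : "yes");
        return 0;
}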
+
 static noinline void switch_commit_roots(struct btrfs_transaction *trans,
                                         struct btrfs_fs_info *fs_info)
 {
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
                root->commit_root = btrfs_root_node(root);
                if (is_fstree(root->objectid))
                        btrfs_unpin_free_ino(root);
+               clear_btree_io_tree(&root->dirty_log_pages);
        }
        up_write(&fs_info->commit_root_sem);
 }
@@ -220,6 +247,7 @@ loop:
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
        INIT_LIST_HEAD(&cur_trans->pending_chunks);
        INIT_LIST_HEAD(&cur_trans->switch_commits);
+       INIT_LIST_HEAD(&cur_trans->pending_ordered);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
        extent_io_tree_init(&cur_trans->dirty_pages,
                             fs_info->btree_inode->i_mapping);
@@ -488,6 +516,7 @@ again:
        h->sync = false;
        INIT_LIST_HEAD(&h->qgroup_ref_list);
        INIT_LIST_HEAD(&h->new_bgs);
+       INIT_LIST_HEAD(&h->ordered);
 
        smp_mb();
        if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -719,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        if (!list_empty(&trans->new_bgs))
                btrfs_create_pending_block_groups(trans, root);
 
+       if (!list_empty(&trans->ordered)) {
+               spin_lock(&info->trans_lock);
+               list_splice(&trans->ordered, &cur_trans->pending_ordered);
+               spin_unlock(&info->trans_lock);
+       }
+
        trans->delayed_ref_updates = 0;
        if (!trans->sync) {
                must_run_delayed_refs =
@@ -828,17 +863,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 
        while (!find_first_extent_bit(dirty_pages, start, &start, &end,
                                      mark, &cached_state)) {
-               convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
-                                  mark, &cached_state, GFP_NOFS);
-               cached_state = NULL;
-               err = filemap_fdatawrite_range(mapping, start, end);
+               bool wait_writeback = false;
+
+               err = convert_extent_bit(dirty_pages, start, end,
+                                        EXTENT_NEED_WAIT,
+                                        mark, &cached_state, GFP_NOFS);
+               /*
+                * convert_extent_bit can return -ENOMEM, which is most of the
+                * time a temporary error. So when it happens, ignore the error
+                * and wait for writeback of this range to finish - because we
+                * failed to set the bit EXTENT_NEED_WAIT for the range, a call
+                * to btrfs_wait_marked_extents() would not know that writeback
+                * for this range started and therefore wouldn't wait for it to
+                * finish - we don't want to commit a superblock that points to
+                * btree nodes/leafs for which writeback hasn't finished yet
+                * (and without errors).
+                * We cleanup any entries left in the io tree when committing
+                * the transaction (through clear_btree_io_tree()).
+                */
+               if (err == -ENOMEM) {
+                       err = 0;
+                       wait_writeback = true;
+               }
+               if (!err)
+                       err = filemap_fdatawrite_range(mapping, start, end);
                if (err)
                        werr = err;
+               else if (wait_writeback)
+                       werr = filemap_fdatawait_range(mapping, start, end);
+               free_extent_state(cached_state);
+               cached_state = NULL;
                cond_resched();
                start = end + 1;
        }
-       if (err)
-               werr = err;
        return werr;
 }
 
@@ -862,11 +919,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 
        while (!find_first_extent_bit(dirty_pages, start, &start, &end,
                                      EXTENT_NEED_WAIT, &cached_state)) {
-               clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
-                                0, 0, &cached_state, GFP_NOFS);
-               err = filemap_fdatawait_range(mapping, start, end);
+               /*
+                * Ignore -ENOMEM errors returned by clear_extent_bit().
+                * When committing the transaction, we'll remove any entries
+                * left in the io tree. For a log commit, we don't remove them
+                * after committing the log because the tree can be accessed
+                * concurrently - we do it only at transaction commit time when
+                * it's safe to do it (through clear_btree_io_tree()).
+                */
+               err = clear_extent_bit(dirty_pages, start, end,
+                                      EXTENT_NEED_WAIT,
+                                      0, 0, &cached_state, GFP_NOFS);
+               if (err == -ENOMEM)
+                       err = 0;
+               if (!err)
+                       err = filemap_fdatawait_range(mapping, start, end);
                if (err)
                        werr = err;
+               free_extent_state(cached_state);
+               cached_state = NULL;
                cond_resched();
                start = end + 1;
        }
@@ -919,17 +990,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
        return 0;
 }
 
-int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root)
 {
-       if (!trans || !trans->transaction) {
-               struct inode *btree_inode;
-               btree_inode = root->fs_info->btree_inode;
-               return filemap_write_and_wait(btree_inode->i_mapping);
-       }
-       return btrfs_write_and_wait_marked_extents(root,
+       int ret;
+
+       ret = btrfs_write_and_wait_marked_extents(root,
                                           &trans->transaction->dirty_pages,
                                           EXTENT_DIRTY);
+       clear_btree_io_tree(&trans->transaction->dirty_pages);
+
+       return ret;
 }
 
 /*
@@ -1652,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
                btrfs_wait_ordered_roots(fs_info, -1);
 }
 
+static inline void
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
+                          struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_ordered_extent *ordered;
+
+       spin_lock(&fs_info->trans_lock);
+       while (!list_empty(&cur_trans->pending_ordered)) {
+               ordered = list_first_entry(&cur_trans->pending_ordered,
+                                          struct btrfs_ordered_extent,
+                                          trans_list);
+               list_del_init(&ordered->trans_list);
+               spin_unlock(&fs_info->trans_lock);
+
+               wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+                                                  &ordered->flags));
+               btrfs_put_ordered_extent(ordered);
+               spin_lock(&fs_info->trans_lock);
+       }
+       spin_unlock(&fs_info->trans_lock);
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
 {
@@ -1702,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        }
 
        spin_lock(&root->fs_info->trans_lock);
+       list_splice(&trans->ordered, &cur_trans->pending_ordered);
        if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
                spin_unlock(&root->fs_info->trans_lock);
                atomic_inc(&cur_trans->use_count);
@@ -1754,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_wait_delalloc_flush(root->fs_info);
 
+       btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+
        btrfs_scrub_pause(root);
        /*
         * Ok now we need to make sure to block out any other joins while we
@@ -1842,13 +1938,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        }
 
        /*
-        * Since the transaction is done, we should set the inode map cache flag
-        * before any other comming transaction.
+        * Since the transaction is done, we can apply the pending changes
+        * before the next transaction.
         */
-       if (btrfs_test_opt(root, CHANGE_INODE_CACHE))
-               btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
-       else
-               btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
+       btrfs_apply_pending_changes(root->fs_info);
 
        /* commit_fs_roots gets rid of all the tree log roots, it is now
         * safe to free the root of tree log roots
@@ -2019,3 +2112,32 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
 
        return (ret < 0) ? 0 : 1;
 }
+
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
+{
+       unsigned long prev;
+       unsigned long bit;
+
+       prev = cmpxchg(&fs_info->pending_changes, 0, 0);
+       if (!prev)
+               return;
+
+       bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
+       if (prev & bit)
+               btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+       prev &= ~bit;
+
+       bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
+       if (prev & bit)
+               btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+       prev &= ~bit;
+
+       bit = 1 << BTRFS_PENDING_COMMIT;
+       if (prev & bit)
+               btrfs_debug(fs_info, "pending commit done");
+       prev &= ~bit;
+
+       if (prev)
+               btrfs_warn(fs_info,
+                       "unknown pending changes left 0x%lx, ignoring", prev);
+}
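
The pending-changes mechanism lets sysfs and mount-option handlers record a request as a bit in a shared mask, which the transaction commit later reads in one step and applies. A userspace sketch of that idea; the bit names mirror the ones above, and the atomic_exchange used to read-and-clear the mask is my own simplification, not a claim about how the kernel clears it:

#include <stdio.h>
#include <stdatomic.h>

enum {
        PENDING_SET_INODE_MAP_CACHE,
        PENDING_CLEAR_INODE_MAP_CACHE,
        PENDING_COMMIT,
};

static _Atomic unsigned long pending_changes;

/* called from "sysfs"/option parsing: just record the request */
static void set_pending(int bit)
{
        atomic_fetch_or(&pending_changes, 1UL << bit);
}

/* called once per transaction commit: consume every recorded request */
static void apply_pending_changes(void)
{
        unsigned long prev = atomic_exchange(&pending_changes, 0);

        if (prev & (1UL << PENDING_SET_INODE_MAP_CACHE))
                printf("enable inode map cache\n");
        if (prev & (1UL << PENDING_CLEAR_INODE_MAP_CACHE))
                printf("disable inode map cache\n");
        if (prev & (1UL << PENDING_COMMIT))
                printf("a commit was requested\n");
}

int main(void)
{
        set_pending(PENDING_COMMIT);
        set_pending(PENDING_SET_INODE_MAP_CACHE);
        apply_pending_changes();        /* both requests observed exactly once */
        return 0;
}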
index d8f40e1..00ed29c 100644 (file)
@@ -56,6 +56,7 @@ struct btrfs_transaction {
        wait_queue_head_t commit_wait;
        struct list_head pending_snapshots;
        struct list_head pending_chunks;
+       struct list_head pending_ordered;
        struct list_head switch_commits;
        struct btrfs_delayed_ref_root delayed_refs;
        int aborted;
@@ -105,6 +106,7 @@ struct btrfs_trans_handle {
         */
        struct btrfs_root *root;
        struct seq_list delayed_ref_elem;
+       struct list_head ordered;
        struct list_head qgroup_ref_list;
        struct list_head new_bgs;
 };
@@ -145,8 +147,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
                                        struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
-int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
-                                    struct btrfs_root *root);
 
 void btrfs_add_dead_root(struct btrfs_root *root);
 int btrfs_defrag_root(struct btrfs_root *root);
@@ -170,4 +170,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 int btrfs_transaction_blocked(struct btrfs_fs_info *info);
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
 void btrfs_put_transaction(struct btrfs_transaction *transaction);
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
+
 #endif
index 286213c..9a02da1 100644 (file)
@@ -2599,12 +2599,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        index2 = root_log_ctx.log_transid % 2;
        if (atomic_read(&log_root_tree->log_commit[index2])) {
                blk_finish_plug(&plug);
-               btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+               ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
+                                               mark);
+               btrfs_wait_logged_extents(trans, log, log_transid);
                wait_log_commit(trans, log_root_tree,
                                root_log_ctx.log_transid);
-               btrfs_free_logged_extents(log, log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
-               ret = root_log_ctx.log_ret;
+               if (!ret)
+                       ret = root_log_ctx.log_ret;
                goto out;
        }
        ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
@@ -2641,11 +2643,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                mutex_unlock(&log_root_tree->log_mutex);
                goto out_wake_log_root;
        }
-       btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
-       btrfs_wait_marked_extents(log_root_tree,
-                                 &log_root_tree->dirty_log_pages,
-                                 EXTENT_NEW | EXTENT_DIRTY);
-       btrfs_wait_logged_extents(log, log_transid);
+       ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+       if (!ret)
+               ret = btrfs_wait_marked_extents(log_root_tree,
+                                               &log_root_tree->dirty_log_pages,
+                                               EXTENT_NEW | EXTENT_DIRTY);
+       if (ret) {
+               btrfs_set_log_full_commit(root->fs_info, trans);
+               btrfs_free_logged_extents(log, log_transid);
+               mutex_unlock(&log_root_tree->log_mutex);
+               goto out_wake_log_root;
+       }
+       btrfs_wait_logged_extents(trans, log, log_transid);
 
        btrfs_set_super_log_root(root->fs_info->super_for_commit,
                                log_root_tree->node->start);
@@ -3626,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
                            test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
 
                if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
+                       /*
+                        * Clear the AS_EIO/AS_ENOSPC flags from the inode's
+                        * i_mapping flags, so that the next fsync won't get
+                        * an outdated io error too.
+                        */
+                       btrfs_inode_check_errors(inode);
                        *ordered_io_error = true;
                        break;
                }
@@ -3766,7 +3781,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
        fi = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
 
-       btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+       btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
                                               &token);
        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
                btrfs_set_token_file_extent_type(leaf, fi,
@@ -3963,7 +3978,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
        mutex_lock(&BTRFS_I(inode)->log_mutex);
 
-       btrfs_get_logged_extents(inode, &logged_list);
+       btrfs_get_logged_extents(inode, &logged_list, start, end);
 
        /*
         * a brute force approach to making sure we get the most uptodate
@@ -4089,6 +4104,21 @@ log_extents:
        btrfs_release_path(path);
        btrfs_release_path(dst_path);
        if (fast_search) {
+               /*
+                * Some ordered extents started by fsync might have completed
+                * before we collected the ordered extents in logged_list, which
+                * means they're gone, not in our logged_list nor in the inode's
+                * ordered tree. We want the application/user space to know an
+                * error happened while attempting to persist file data so that
+                * it can take proper action. If such error happened, we leave
+                * without writing to the log tree and the fsync must report the
+                * file data write error and not commit the current transaction.
+                */
+               err = btrfs_inode_check_errors(inode);
+               if (err) {
+                       ctx->io_err = err;
+                       goto out_unlock;
+               }
                ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
                                                &logged_list, ctx);
                if (ret) {
index d47289c..0144790 100644 (file)
@@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
 
-static void lock_chunks(struct btrfs_root *root)
-{
-       mutex_lock(&root->fs_info->chunk_mutex);
-}
-
-static void unlock_chunks(struct btrfs_root *root)
-{
-       mutex_unlock(&root->fs_info->chunk_mutex);
-}
-
 static struct btrfs_fs_devices *__alloc_fs_devices(void)
 {
        struct btrfs_fs_devices *fs_devs;
@@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
                                   u64 *start, u64 len)
 {
        struct extent_map *em;
+       struct list_head *search_list = &trans->transaction->pending_chunks;
        int ret = 0;
 
-       list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
+again:
+       list_for_each_entry(em, search_list, list) {
                struct map_lookup *map;
                int i;
 
@@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
                        ret = 1;
                }
        }
+       if (search_list == &trans->transaction->pending_chunks) {
+               search_list = &trans->root->fs_info->pinned_chunks;
+               goto again;
+       }
 
        return ret;
 }
@@ -1800,8 +1796,8 @@ error_undo:
        goto error_brelse;
 }
 
-void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
-                                struct btrfs_device *srcdev)
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
+                                       struct btrfs_device *srcdev)
 {
        struct btrfs_fs_devices *fs_devices;
 
@@ -1829,6 +1825,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
 
        if (srcdev->bdev)
                fs_devices->open_devices--;
+}
+
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
+                                     struct btrfs_device *srcdev)
+{
+       struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
 
        call_rcu(&srcdev->rcu, free_device);
 
@@ -2647,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
                }
        }
 
-       ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+       ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
        if (ret) {
                btrfs_abort_transaction(trans, extent_root, ret);
                goto out;
        }
 
-       write_lock(&em_tree->lock);
-       remove_extent_mapping(em_tree, em);
-       write_unlock(&em_tree->lock);
-
-       /* once for the tree */
-       free_extent_map(em);
 out:
        /* once for us */
        free_extent_map(em);
@@ -4505,6 +4501,8 @@ error_del_extent:
        free_extent_map(em);
        /* One for the tree reference */
        free_extent_map(em);
+       /* One for the pending_chunks list reference */
+       free_extent_map(em);
 error:
        kfree(devices_info);
        return ret;
@@ -4881,13 +4879,15 @@ static inline int parity_smaller(u64 a, u64 b)
 static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
 {
        struct btrfs_bio_stripe s;
+       int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
        int i;
        u64 l;
        int again = 1;
+       int m;
 
        while (again) {
                again = 0;
-               for (i = 0; i < bbio->num_stripes - 1; i++) {
+               for (i = 0; i < real_stripes - 1; i++) {
                        if (parity_smaller(raid_map[i], raid_map[i+1])) {
                                s = bbio->stripes[i];
                                l = raid_map[i];
@@ -4895,6 +4895,14 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
                                raid_map[i] = raid_map[i+1];
                                bbio->stripes[i+1] = s;
                                raid_map[i+1] = l;
+
+                               if (bbio->tgtdev_map) {
+                                       m = bbio->tgtdev_map[i];
+                                       bbio->tgtdev_map[i] =
+                                                       bbio->tgtdev_map[i + 1];
+                                       bbio->tgtdev_map[i + 1] = m;
+                               }
+
                                again = 1;
                        }
                }
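
The hunk above teaches sort_parity_stripes() to keep the new tgtdev_map in step with the stripes and the raid_map while bubble-sorting, and to sort only the real (non-target) stripes. A userspace sketch of the same co-sorted bubble sort, with the stripe structs flattened to plain numbers and the demo values made up:

#include <stdio.h>
#include <stdint.h>

#define RAID6_Q_STRIPE  ((uint64_t)-1)
#define RAID5_P_STRIPE  ((uint64_t)-2)

static int parity_smaller(uint64_t a, uint64_t b)
{
        return a > b;   /* the parity markers are huge, so they sort last */
}

static void sort_parity_stripes(uint64_t *stripes, uint64_t *raid_map,
                                int *tgtdev_map, int real_stripes)
{
        int again = 1;

        while (again) {
                again = 0;
                for (int i = 0; i < real_stripes - 1; i++) {
                        if (parity_smaller(raid_map[i], raid_map[i + 1])) {
                                uint64_t s = stripes[i];
                                uint64_t l = raid_map[i];

                                stripes[i] = stripes[i + 1];
                                raid_map[i] = raid_map[i + 1];
                                stripes[i + 1] = s;
                                raid_map[i + 1] = l;

                                if (tgtdev_map) {       /* keep it in sync too */
                                        int m = tgtdev_map[i];

                                        tgtdev_map[i] = tgtdev_map[i + 1];
                                        tgtdev_map[i + 1] = m;
                                }
                                again = 1;
                        }
                }
        }
}

int main(void)
{
        uint64_t stripes[]  = { 300, 100, 200 };                /* fake physicals */
        uint64_t raid_map[] = { RAID5_P_STRIPE, 0, 65536 };     /* P sorts last */
        int tgtdev_map[]    = { 2, 0, 1 };

        sort_parity_stripes(stripes, raid_map, tgtdev_map, 3);
        for (int i = 0; i < 3; i++)
                printf("%llu -> phys %llu, tgt %d\n",
                       (unsigned long long)raid_map[i],
                       (unsigned long long)stripes[i], tgtdev_map[i]);
        return 0;
}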
@@ -4923,6 +4931,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
        int ret = 0;
        int num_stripes;
        int max_errors = 0;
+       int tgtdev_indexes = 0;
        struct btrfs_bio *bbio = NULL;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        int dev_replace_is_ongoing = 0;
@@ -5161,15 +5170,14 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                                BTRFS_BLOCK_GROUP_RAID6)) {
                u64 tmp;
 
-               if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
-                   && raid_map_ret) {
+               if (raid_map_ret &&
+                   ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+                    mirror_num > 1)) {
                        int i, rot;
 
                        /* push stripe_nr back to the start of the full stripe */
                        stripe_nr = raid56_full_stripe_start;
-                       do_div(stripe_nr, stripe_len);
-
-                       stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+                       do_div(stripe_nr, stripe_len * nr_data_stripes(map));
 
                        /* RAID[56] write or recovery. Return all stripes */
                        num_stripes = map->num_stripes;
@@ -5235,14 +5243,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                        num_alloc_stripes <<= 1;
                if (rw & REQ_GET_READ_MIRRORS)
                        num_alloc_stripes++;
+               tgtdev_indexes = num_stripes;
        }
-       bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
+
+       bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
+                      GFP_NOFS);
        if (!bbio) {
                kfree(raid_map);
                ret = -ENOMEM;
                goto out;
        }
        atomic_set(&bbio->error, 0);
+       if (dev_replace_is_ongoing)
+               bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
 
        if (rw & REQ_DISCARD) {
                int factor = 0;
@@ -5327,6 +5340,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
        if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
                max_errors = btrfs_chunk_max_errors(map);
 
+       tgtdev_indexes = 0;
        if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
            dev_replace->tgtdev != NULL) {
                int index_where_to_add;
@@ -5355,8 +5369,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                                new->physical = old->physical;
                                new->length = old->length;
                                new->dev = dev_replace->tgtdev;
+                               bbio->tgtdev_map[i] = index_where_to_add;
                                index_where_to_add++;
                                max_errors++;
+                               tgtdev_indexes++;
                        }
                }
                num_stripes = index_where_to_add;
@@ -5402,7 +5418,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                                tgtdev_stripe->length =
                                        bbio->stripes[index_srcdev].length;
                                tgtdev_stripe->dev = dev_replace->tgtdev;
+                               bbio->tgtdev_map[index_srcdev] = num_stripes;
 
+                               tgtdev_indexes++;
                                num_stripes++;
                        }
                }
@@ -5412,6 +5430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
        bbio->num_stripes = num_stripes;
        bbio->max_errors = max_errors;
        bbio->mirror_num = mirror_num;
+       bbio->num_tgtdevs = tgtdev_indexes;
 
        /*
         * this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -5443,6 +5462,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                                 mirror_num, NULL);
 }
 
+/* For Scrub/replace */
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+                    u64 logical, u64 *length,
+                    struct btrfs_bio **bbio_ret, int mirror_num,
+                    u64 **raid_map_ret)
+{
+       return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
+                                mirror_num, raid_map_ret);
+}
+
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                     u64 chunk_start, u64 physical, u64 devid,
                     u64 **logical, int *naddrs, int *stripe_len)
@@ -5812,12 +5841,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                } else {
                        ret = raid56_parity_recover(root, bio, bbio,
                                                    raid_map, map_length,
-                                                   mirror_num);
+                                                   mirror_num, 1);
                }
-               /*
-                * FIXME, replace dosen't support raid56 yet, please fix
-                * it in the future.
-                */
+
                btrfs_bio_counter_dec(root->fs_info);
                return ret;
        }
index 08980fa..d6fe73c 100644 (file)
@@ -292,7 +292,7 @@ struct btrfs_bio_stripe {
 struct btrfs_bio;
 typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
 
-#define BTRFS_BIO_ORIG_BIO_SUBMITTED   0x1
+#define BTRFS_BIO_ORIG_BIO_SUBMITTED   (1 << 0)
 
 struct btrfs_bio {
        atomic_t stripes_pending;
@@ -305,6 +305,8 @@ struct btrfs_bio {
        int max_errors;
        int num_stripes;
        int mirror_num;
+       int num_tgtdevs;
+       int *tgtdev_map;
        struct btrfs_bio_stripe stripes[];
 };
 
@@ -387,12 +389,18 @@ struct btrfs_balance_control {
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length);
 
-#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
-                           (sizeof(struct btrfs_bio_stripe) * (n)))
+#define btrfs_bio_size(total_stripes, real_stripes)            \
+       (sizeof(struct btrfs_bio) +                             \
+        (sizeof(struct btrfs_bio_stripe) * (total_stripes)) +  \
+        (sizeof(int) * (real_stripes)))
 
 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                    u64 logical, u64 *length,
                    struct btrfs_bio **bbio_ret, int mirror_num);
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+                    u64 logical, u64 *length,
+                    struct btrfs_bio **bbio_ret, int mirror_num,
+                    u64 **raid_map_ret);
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                     u64 chunk_start, u64 physical, u64 devid,
                     u64 **logical, int *naddrs, int *stripe_len);
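
The new two-argument btrfs_bio_size() reserves, in a single allocation, room for the flexible stripes[] array plus one int per stripe for the target-device index map, and __btrfs_map_block() then points bbio->tgtdev_map just past the stripe array. A minimal userspace sketch of that single-allocation layout, under invented names (bio_like, struct stripe) rather than the kernel's.

#include <stdio.h>
#include <stdlib.h>

/* Invented names; only the layout idea mirrors btrfs_bio. */
struct stripe { unsigned long long physical; };

struct bio_like {
	int num_stripes;
	int num_tgtdevs;
	int *tgtdev_map;		/* points into the same allocation */
	struct stripe stripes[];	/* flexible array member */
};

/* Mirrors the shape of btrfs_bio_size(total_stripes, real_stripes):
 * header + stripe array + one int per real stripe for the index map. */
static size_t bio_like_size(int total_stripes, int real_stripes)
{
	return sizeof(struct bio_like) +
	       sizeof(struct stripe) * total_stripes +
	       sizeof(int) * real_stripes;
}

int main(void)
{
	int total = 4, real = 3;
	struct bio_like *b = calloc(1, bio_like_size(total, real));

	if (!b)
		return 1;
	b->num_stripes = total;
	b->num_tgtdevs = 1;	/* say one stripe got a duplicate on the target device */
	/* Carve the int array out of the space right after stripes[total], like
	 * bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes). */
	b->tgtdev_map = (int *)(b->stripes + total);
	b->tgtdev_map[0] = 2;	/* e.g. stripe 0's copy lives at stripe index 2 */
	printf("tgtdev_map[0] = %d\n", b->tgtdev_map[0]);
	free(b);
	return 0;
}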
@@ -448,8 +456,10 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
                        struct btrfs_fs_info *fs_info);
-void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
-                                struct btrfs_device *srcdev);
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
+                                       struct btrfs_device *srcdev);
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
+                                     struct btrfs_device *srcdev);
 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
                                      struct btrfs_device *tgtdev);
 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
@@ -513,4 +523,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
 void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
                                        struct btrfs_transaction *transaction);
+
+static inline void lock_chunks(struct btrfs_root *root)
+{
+       mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static inline void unlock_chunks(struct btrfs_root *root)
+{
+       mutex_unlock(&root->fs_info->chunk_mutex);
+}
+
+
 #endif
index dcf2013..47b1946 100644 (file)
@@ -29,6 +29,7 @@
 #include "xattr.h"
 #include "disk-io.h"
 #include "props.h"
+#include "locking.h"
 
 
 ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
@@ -91,7 +92,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
                       struct inode *inode, const char *name,
                       const void *value, size_t size, int flags)
 {
-       struct btrfs_dir_item *di;
+       struct btrfs_dir_item *di = NULL;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_path *path;
        size_t name_len = strlen(name);
@@ -103,84 +104,119 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+       path->skip_release_on_error = 1;
+
+       if (!value) {
+               di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+                                       name, name_len, -1);
+               if (!di && (flags & XATTR_REPLACE))
+                       ret = -ENODATA;
+               else if (di)
+                       ret = btrfs_delete_one_dir_name(trans, root, path, di);
+               goto out;
+       }
 
+       /*
+        * For a replace we can't just do the insert blindly.
+        * Do a lookup first (read-only btrfs_search_slot), and return if the
+        * xattr doesn't exist. If it exists, fall through to the insert/replace
+        * path - we can't race with a concurrent xattr delete, because the VFS
+        * locks the inode's i_mutex before calling setxattr or removexattr.
+        */
        if (flags & XATTR_REPLACE) {
-               di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
-                                       name_len, -1);
-               if (IS_ERR(di)) {
-                       ret = PTR_ERR(di);
-                       goto out;
-               } else if (!di) {
+               ASSERT(mutex_is_locked(&inode->i_mutex));
+               di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
+                                       name, name_len, 0);
+               if (!di) {
                        ret = -ENODATA;
                        goto out;
                }
-               ret = btrfs_delete_one_dir_name(trans, root, path, di);
-               if (ret)
-                       goto out;
                btrfs_release_path(path);
+               di = NULL;
+       }
 
+       ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
+                                     name, name_len, value, size);
+       if (ret == -EOVERFLOW) {
                /*
-                * remove the attribute
+                * We have an existing item in a leaf and split_leaf couldn't
+                * expand it. That item might or might not have a dir_item
+                * that matches our target xattr, so let's check.
                 */
-               if (!value)
-                       goto out;
-       } else {
-               di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
-                                       name, name_len, 0);
-               if (IS_ERR(di)) {
-                       ret = PTR_ERR(di);
+               ret = 0;
+               btrfs_assert_tree_locked(path->nodes[0]);
+               di = btrfs_match_dir_item_name(root, path, name, name_len);
+               if (!di && !(flags & XATTR_REPLACE)) {
+                       ret = -ENOSPC;
                        goto out;
                }
-               if (!di && !value)
-                       goto out;
-               btrfs_release_path(path);
+       } else if (ret == -EEXIST) {
+               ret = 0;
+               di = btrfs_match_dir_item_name(root, path, name, name_len);
+               ASSERT(di); /* logic error */
+       } else if (ret) {
+               goto out;
        }
 
-again:
-       ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
-                                     name, name_len, value, size);
-       /*
-        * If we're setting an xattr to a new value but the new value is say
-        * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
-        * back from split_leaf.  This is because it thinks we'll be extending
-        * the existing item size, but we're asking for enough space to add the
-        * item itself.  So if we get EOVERFLOW just set ret to EEXIST and let
-        * the rest of the function figure it out.
-        */
-       if (ret == -EOVERFLOW)
+       if (di && (flags & XATTR_CREATE)) {
                ret = -EEXIST;
+               goto out;
+       }
 
-       if (ret == -EEXIST) {
-               if (flags & XATTR_CREATE)
-                       goto out;
+       if (di) {
                /*
-                * We can't use the path we already have since we won't have the
-                * proper locking for a delete, so release the path and
-                * re-lookup to delete the thing.
+                * We're doing a replace, and it must be atomic, that is, at
+                * any point in time we have either the old or the new xattr
+                * value in the tree. We don't want readers (getxattr and
+                * listxattrs) to miss a value; this is especially important
+                * for ACLs.
                 */
-               btrfs_release_path(path);
-               di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
-                                       name, name_len, -1);
-               if (IS_ERR(di)) {
-                       ret = PTR_ERR(di);
-                       goto out;
-               } else if (!di) {
-                       /* Shouldn't happen but just in case... */
-                       btrfs_release_path(path);
-                       goto again;
+               const int slot = path->slots[0];
+               struct extent_buffer *leaf = path->nodes[0];
+               const u16 old_data_len = btrfs_dir_data_len(leaf, di);
+               const u32 item_size = btrfs_item_size_nr(leaf, slot);
+               const u32 data_size = sizeof(*di) + name_len + size;
+               struct btrfs_item *item;
+               unsigned long data_ptr;
+               char *ptr;
+
+               if (size > old_data_len) {
+                       if (btrfs_leaf_free_space(root, leaf) <
+                           (size - old_data_len)) {
+                               ret = -ENOSPC;
+                               goto out;
+                       }
                }
 
-               ret = btrfs_delete_one_dir_name(trans, root, path, di);
-               if (ret)
-                       goto out;
+               if (old_data_len + name_len + sizeof(*di) == item_size) {
+                       /* No other xattrs packed in the same leaf item. */
+                       if (size > old_data_len)
+                               btrfs_extend_item(root, path,
+                                                 size - old_data_len);
+                       else if (size < old_data_len)
+                               btrfs_truncate_item(root, path, data_size, 1);
+               } else {
+                       /* There are other xattrs packed in the same item. */
+                       ret = btrfs_delete_one_dir_name(trans, root, path, di);
+                       if (ret)
+                               goto out;
+                       btrfs_extend_item(root, path, data_size);
+               }
 
+               item = btrfs_item_nr(slot);
+               ptr = btrfs_item_ptr(leaf, slot, char);
+               ptr += btrfs_item_size(leaf, item) - data_size;
+               di = (struct btrfs_dir_item *)ptr;
+               btrfs_set_dir_data_len(leaf, di, size);
+               data_ptr = ((unsigned long)(di + 1)) + name_len;
+               write_extent_buffer(leaf, value, data_ptr, size);
+               btrfs_mark_buffer_dirty(leaf);
+       } else {
                /*
-                * We have a value to set, so go back and try to insert it now.
+                * This was an insert and we had space for the xattr, so
+                * path->slots[0] is where our xattr dir_item is and
+                * btrfs_insert_xattr_item() filled it in.
                 */
-               if (value) {
-                       btrfs_release_path(path);
-                       goto again;
-               }
        }
 out:
        btrfs_free_path(path);
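
The do_setxattr() rewrite above replaces an existing xattr value in place, extending or truncating the leaf item rather than deleting and re-inserting the dir_item, so concurrent getxattr()/listxattr() never see the attribute vanish. The sketch below shows the same grow/shrink-in-place idea on a toy packed record inside a byte buffer; the struct, helper and sizes are invented and far simpler than btrfs leaf items.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdalign.h>

/* Toy packed record: [header][name bytes][value bytes], followed by whatever
 * else lives in the buffer (the "tail"). Invented layout, not btrfs's. */
struct item_hdr {
	uint16_t name_len;
	uint16_t data_len;
};

/*
 * Replace the value of the record at 'off' in buf[0..*used), shifting the tail
 * so the record grows or shrinks in place; the record never disappears, which
 * is the property the xattr change above is after.
 */
static int replace_value(uint8_t *buf, size_t cap, size_t *used, size_t off,
			 const void *val, uint16_t val_len)
{
	struct item_hdr *h = (struct item_hdr *)(buf + off);
	size_t old_end = off + sizeof(*h) + h->name_len + h->data_len;
	size_t new_end = off + sizeof(*h) + h->name_len + val_len;
	size_t tail = *used - old_end;

	if (*used - h->data_len + val_len > cap)
		return -1;				/* like -ENOSPC */
	memmove(buf + new_end, buf + old_end, tail);	/* grow or shrink */
	memcpy(buf + off + sizeof(*h) + h->name_len, val, val_len);
	h->data_len = val_len;
	*used = new_end + tail;
	return 0;
}

int main(void)
{
	alignas(struct item_hdr) uint8_t buf[128];
	struct item_hdr h = { .name_len = 9, .data_len = 3 };
	size_t used;

	/* Pack one record: name "user.demo", value "old". */
	memcpy(buf, &h, sizeof(h));
	memcpy(buf + sizeof(h), "user.demo", 9);
	memcpy(buf + sizeof(h) + 9, "old", 3);
	used = sizeof(h) + 9 + 3;

	if (!replace_value(buf, sizeof(buf), &used, 0, "newer-value", 11))
		printf("replaced in place, %zu bytes used\n", used);
	return 0;
}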
index 2f47824..611e1c5 100644 (file)
@@ -157,6 +157,7 @@ struct btrfs_ioctl_dev_replace_status_params {
 #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR                        0
 #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED             1
 #define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED         2
+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS                3
 struct btrfs_ioctl_dev_replace_args {
        __u64 cmd;      /* in */
        __u64 result;   /* out */
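
The new result code lets user space tell "a scrub is in progress" (presumably returned when a device replace cannot start while a scrub runs) apart from "a replace is already started". A hedged sketch of decoding the result field follows; the constants mirror the values in the hunk above, and real code would include <linux/btrfs.h> and read the value from btrfs_ioctl_dev_replace_args.result after issuing the dev-replace ioctl.

#include <stdio.h>

/* Values as defined in include/uapi/linux/btrfs.h (shown above); real code
 * should include <linux/btrfs.h> instead of redefining them. */
#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR          0
#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED       1
#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED   2
#define BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS  3

static const char *dev_replace_result_str(unsigned long long result)
{
	switch (result) {
	case BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR:
		return "no error";
	case BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED:
		return "not started";
	case BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED:
		return "a device replace is already running";
	case BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS:
		return "a scrub is in progress, try again later";
	default:
		return "unknown result code";
	}
}

int main(void)
{
	/* In real code this comes from btrfs_ioctl_dev_replace_args.result. */
	unsigned long long result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;

	printf("dev replace: %s\n", dev_replace_result_str(result));
	return 0;
}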