Btrfs: fix chunk allocation error handling
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0f5ebb7..13efbcf 100644
@@ -25,7 +25,6 @@
 #include <linux/capability.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
-#include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
@@ -36,6 +35,8 @@
 #include "async-thread.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
+#include "math.h"
+#include "dev-replace.h"
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
        kfree(fs_devices);
 }
 
+static void btrfs_kobject_uevent(struct block_device *bdev,
+                                enum kobject_action action)
+{
+       int ret;
+
+       ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
+       if (ret)
+               pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
+                       action,
+                       kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
+                       &disk_to_dev(bdev->bd_disk)->kobj);
+}
+
 void btrfs_cleanup_fs_uuids(void)
 {
        struct btrfs_fs_devices *fs_devices;
@@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
        return NULL;
 }
 
+static int
+btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
+                     int flush, struct block_device **bdev,
+                     struct buffer_head **bh)
+{
+       int ret;
+
+       *bdev = blkdev_get_by_path(device_path, flags, holder);
+
+       if (IS_ERR(*bdev)) {
+               ret = PTR_ERR(*bdev);
+               printk(KERN_INFO "btrfs: open %s failed\n", device_path);
+               goto error;
+       }
+
+       if (flush)
+               filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+       ret = set_blocksize(*bdev, 4096);
+       if (ret) {
+               blkdev_put(*bdev, flags);
+               goto error;
+       }
+       invalidate_bdev(*bdev);
+       *bh = btrfs_read_dev_super(*bdev);
+       if (!*bh) {
+               ret = -EINVAL;
+               blkdev_put(*bdev, flags);
+               goto error;
+       }
+
+       return 0;
+
+error:
+       *bdev = NULL;
+       *bh = NULL;
+       return ret;
+}
+
 static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
 {
@@ -467,7 +519,8 @@ error:
        return ERR_PTR(-ENOMEM);
 }
 
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+                              struct btrfs_fs_devices *fs_devices, int step)
 {
        struct btrfs_device *device, *next;
 
@@ -480,8 +533,9 @@ again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata) {
-                       if (!latest_transid ||
-                           device->generation > latest_transid) {
+                       if (!device->is_tgtdev_for_dev_replace &&
+                           (!latest_transid ||
+                            device->generation > latest_transid)) {
                                latest_devid = device->devid;
                                latest_transid = device->generation;
                                latest_bdev = device->bdev;
@@ -489,6 +543,21 @@ again:
                        continue;
                }
 
+               if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
+                       /*
+                        * In the first step, keep the device which has
+                        * the correct fsid and the devid that is used
+                        * for the dev_replace procedure.
+                        * In the second step, the dev_replace state is
+                        * read from the device tree and it is known
+                        * whether the procedure is really active or
+                        * not, which means whether this device is
+                        * used or whether it should be removed.
+                        */
+                       if (step == 0 || device->is_tgtdev_for_dev_replace) {
+                               continue;
+                       }
+               }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
@@ -497,7 +566,8 @@ again:
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
-                       fs_devices->rw_devices--;
+                       if (!device->is_tgtdev_for_dev_replace)
+                               fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
@@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
                if (device->bdev)
                        fs_devices->open_devices--;
 
-               if (device->writeable) {
+               if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }
@@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                if (!device->name)
                        continue;
 
-               bdev = blkdev_get_by_path(device->name->str, flags, holder);
-               if (IS_ERR(bdev)) {
-                       printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
-                       goto error;
-               }
-               filemap_write_and_wait(bdev->bd_inode->i_mapping);
-               invalidate_bdev(bdev);
-               set_blocksize(bdev, 4096);
-
-               bh = btrfs_read_dev_super(bdev);
-               if (!bh)
-                       goto error_close;
+               ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+                                           &bdev, &bh);
+               if (ret)
+                       continue;
 
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                        fs_devices->rotating = 1;
 
                fs_devices->open_devices++;
-               if (device->writeable) {
+               if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
@@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 error_brelse:
                brelse(bh);
-error_close:
                blkdev_put(bdev, flags);
-error:
                continue;
        }
        if (fs_devices->open_devices == 0) {
@@ -732,52 +792,100 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
        return ret;
 }
 
+/*
+ * Look for a btrfs signature on a device. This may be called out of the mount path
+ * and we are not allowed to call set_blocksize during the scan. The superblock
+ * is read via pagecache
+ */
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
 {
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
-       struct buffer_head *bh;
-       int ret;
+       struct page *page;
+       void *p;
+       int ret = -EINVAL;
        u64 devid;
        u64 transid;
        u64 total_devices;
+       u64 bytenr;
+       pgoff_t index;
 
+       /*
+        * we would like to check all the supers, but that would make
+        * a btrfs mount succeed after a mkfs from a different FS.
+        * So, we need to add a special mount option to scan for
+        * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+        */
+       bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;
+       mutex_lock(&uuid_mutex);
+
        bdev = blkdev_get_by_path(path, flags, holder);
 
        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
+               printk(KERN_INFO "btrfs: open %s failed\n", path);
                goto error;
        }
 
-       mutex_lock(&uuid_mutex);
-       ret = set_blocksize(bdev, 4096);
-       if (ret)
-               goto error_close;
-       bh = btrfs_read_dev_super(bdev);
-       if (!bh) {
-               ret = -EINVAL;
-               goto error_close;
-       }
-       disk_super = (struct btrfs_super_block *)bh->b_data;
+       /* make sure our super fits in the device */
+       if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
+               goto error_bdev_put;
+
+       /* make sure our super fits in the page */
+       if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+               goto error_bdev_put;
+
+       /* make sure our super doesn't straddle pages on disk */
+       index = bytenr >> PAGE_CACHE_SHIFT;
+       if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
+               goto error_bdev_put;
+
+       /* pull in the page with our super */
+       page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+                                  index, GFP_NOFS);
+
+       if (IS_ERR_OR_NULL(page))
+               goto error_bdev_put;
+
+       p = kmap(page);
+
+       /* align our pointer to the offset of the super block */
+       disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
+
+       if (btrfs_super_bytenr(disk_super) != bytenr ||
+           strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+                   sizeof(disk_super->magic)))
+               goto error_unmap;
+
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        total_devices = btrfs_super_num_devices(disk_super);
-       if (disk_super->label[0])
+
+       if (disk_super->label[0]) {
+               if (disk_super->label[BTRFS_LABEL_SIZE - 1])
+                       disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
                printk(KERN_INFO "device label %s ", disk_super->label);
-       else
+       } else {
                printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
+       }
+
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
+
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);
        if (!ret && fs_devices_ret)
                (*fs_devices_ret)->total_devices = total_devices;
-       brelse(bh);
-error_close:
-       mutex_unlock(&uuid_mutex);
+
+error_unmap:
+       kunmap(page);
+       page_cache_release(page);
+
+error_bdev_put:
        blkdev_put(bdev, flags);
 error:
+       mutex_unlock(&uuid_mutex);
        return ret;
 }
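The rewritten btrfs_scan_one_device() above reads the primary superblock through the page cache instead of via buffer heads, which is why it checks that the super neither runs past the end of the device nor straddles a page. A small user-space sketch of that index/offset arithmetic, assuming 4KiB pages and the 64KiB primary super offset (this is not the kernel code itself):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE       4096ULL
#define PAGE_SHIFT      12
#define SUPER_SIZE      4096ULL         /* assumed btrfs superblock size */

int main(void)
{
        uint64_t bytenr = 64 * 1024;    /* btrfs_sb_offset(0) */
        uint64_t index  = bytenr >> PAGE_SHIFT;         /* page 16 */
        uint64_t offset = bytenr & (PAGE_SIZE - 1);     /* 0 bytes into that page */

        /* same straddle check as in btrfs_scan_one_device() */
        if ((bytenr + SUPER_SIZE - 1) >> PAGE_SHIFT != index)
                printf("super straddles a page boundary, cannot map it from one page\n");
        else
                printf("super lives in page %llu at offset %llu\n",
                       (unsigned long long)index, (unsigned long long)offset);
        return 0;
}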
 
@@ -796,7 +904,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 
        *length = 0;
 
-       if (start >= device->total_bytes)
+       if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
                return 0;
 
        path = btrfs_alloc_path();
@@ -913,7 +1021,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
        max_hole_size = 0;
        hole_size = 0;
 
-       if (search_start >= search_end) {
+       if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
                ret = -ENOSPC;
                goto error;
        }
@@ -1096,6 +1204,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
        struct btrfs_key key;
 
        WARN_ON(!device->in_fs_metadata);
+       WARN_ON(device->is_tgtdev_for_dev_replace);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -1321,25 +1430,36 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        u64 devid;
        u64 num_devices;
        u8 *dev_uuid;
+       unsigned seq;
        int ret = 0;
        bool clear_super = false;
 
        mutex_lock(&uuid_mutex);
 
-       all_avail = root->fs_info->avail_data_alloc_bits |
-               root->fs_info->avail_system_alloc_bits |
-               root->fs_info->avail_metadata_alloc_bits;
+       do {
+               seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+               all_avail = root->fs_info->avail_data_alloc_bits |
+                           root->fs_info->avail_system_alloc_bits |
+                           root->fs_info->avail_metadata_alloc_bits;
+       } while (read_seqretry(&root->fs_info->profiles_lock, seq));
 
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
-           root->fs_info->fs_devices->num_devices <= 4) {
+       num_devices = root->fs_info->fs_devices->num_devices;
+       btrfs_dev_replace_lock(&root->fs_info->dev_replace);
+       if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
+               WARN_ON(num_devices < 1);
+               num_devices--;
+       }
+       btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
+
+       if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
                printk(KERN_ERR "btrfs: unable to go below four devices "
                       "on raid10\n");
                ret = -EINVAL;
                goto out;
        }
 
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
-           root->fs_info->fs_devices->num_devices <= 2) {
+       if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
                printk(KERN_ERR "btrfs: unable to go below two "
                       "devices on raid1\n");
                ret = -EINVAL;
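The avail_*_alloc_bits words are now sampled inside a read_seqbegin()/read_seqretry() loop on fs_info->profiles_lock, so a concurrent profile update simply forces the snapshot to be retaken. The retry shape, reduced to a user-space sketch around a hypothetical sequence counter (the kernel's seqlock_t additionally supplies the required memory barriers):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* hypothetical state guarded by a sequence counter (odd count = writer active) */
static _Atomic unsigned int profiles_seq;
static uint64_t avail_data, avail_system, avail_metadata;

static uint64_t read_all_avail(void)
{
        unsigned int seq;
        uint64_t all;

        do {
                /* like read_seqbegin(): wait for an even (writer-free) count */
                do {
                        seq = atomic_load(&profiles_seq);
                } while (seq & 1);

                all = avail_data | avail_system | avail_metadata;

                /* like read_seqretry(): redo the snapshot if a writer ran meanwhile */
        } while (atomic_load(&profiles_seq) != seq);

        return all;
}

int main(void)
{
        avail_data = 0x8;       /* arbitrary example bits */
        printf("all_avail = 0x%llx\n", (unsigned long long)read_all_avail());
        return 0;
}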
@@ -1357,7 +1477,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                 * is held.
                 */
                list_for_each_entry(tmp, devices, dev_list) {
-                       if (tmp->in_fs_metadata && !tmp->bdev) {
+                       if (tmp->in_fs_metadata &&
+                           !tmp->is_tgtdev_for_dev_replace &&
+                           !tmp->bdev) {
                                device = tmp;
                                break;
                        }
@@ -1371,24 +1493,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                        goto out;
                }
        } else {
-               bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
-                                         root->fs_info->bdev_holder);
-               if (IS_ERR(bdev)) {
-                       ret = PTR_ERR(bdev);
+               ret = btrfs_get_bdev_and_sb(device_path,
+                                           FMODE_WRITE | FMODE_EXCL,
+                                           root->fs_info->bdev_holder, 0,
+                                           &bdev, &bh);
+               if (ret)
                        goto out;
-               }
-
-               set_blocksize(bdev, 4096);
-               invalidate_bdev(bdev);
-               bh = btrfs_read_dev_super(bdev);
-               if (!bh) {
-                       ret = -EINVAL;
-                       goto error_close;
-               }
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                dev_uuid = disk_super->dev_item.uuid;
-               device = btrfs_find_device(root, devid, dev_uuid,
+               device = btrfs_find_device(root->fs_info, devid, dev_uuid,
                                           disk_super->fsid);
                if (!device) {
                        ret = -ENOENT;
@@ -1396,6 +1510,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                }
        }
 
+       if (device->is_tgtdev_for_dev_replace) {
+               pr_err("btrfs: unable to remove the dev_replace target dev\n");
+               ret = -EINVAL;
+               goto error_brelse;
+       }
+
        if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
                printk(KERN_ERR "btrfs: unable to remove the only writeable "
                       "device\n");
@@ -1415,6 +1535,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        if (ret)
                goto error_undo;
 
+       /*
+        * TODO: the superblock still includes this device in its num_devices
+        * counter although write_all_supers() is not locked out. This
+        * could give a filesystem state which requires a degraded mount.
+        */
        ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
        if (ret)
                goto error_undo;
@@ -1425,7 +1550,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        spin_unlock(&root->fs_info->free_chunk_lock);
 
        device->in_fs_metadata = 0;
-       btrfs_scrub_cancel_dev(root, device);
+       btrfs_scrub_cancel_dev(root->fs_info, device);
 
        /*
         * the device list mutex makes sure that we don't change
@@ -1482,7 +1607,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
         * at this point, the device is zero sized.  We want to
         * remove it from the devices list and zero out the old super
         */
-       if (clear_super) {
+       if (clear_super && disk_super) {
                /* make sure this device isn't detected as part of
                 * the FS anymore
                 */
@@ -1493,9 +1618,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
        ret = 0;
 
+       /* Notify udev that device has changed */
+       if (bdev)
+               btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
 error_brelse:
        brelse(bh);
-error_close:
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 out:
@@ -1512,6 +1640,112 @@ error_undo:
        goto error_brelse;
 }
 
+void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
+                                struct btrfs_device *srcdev)
+{
+       WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+       list_del_rcu(&srcdev->dev_list);
+       list_del_rcu(&srcdev->dev_alloc_list);
+       fs_info->fs_devices->num_devices--;
+       if (srcdev->missing) {
+               fs_info->fs_devices->missing_devices--;
+               fs_info->fs_devices->rw_devices++;
+       }
+       if (srcdev->can_discard)
+               fs_info->fs_devices->num_can_discard--;
+       if (srcdev->bdev)
+               fs_info->fs_devices->open_devices--;
+
+       call_rcu(&srcdev->rcu, free_device);
+}
+
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+                                     struct btrfs_device *tgtdev)
+{
+       struct btrfs_device *next_device;
+
+       WARN_ON(!tgtdev);
+       mutex_lock(&fs_info->fs_devices->device_list_mutex);
+       if (tgtdev->bdev) {
+               btrfs_scratch_superblock(tgtdev);
+               fs_info->fs_devices->open_devices--;
+       }
+       fs_info->fs_devices->num_devices--;
+       if (tgtdev->can_discard)
+               fs_info->fs_devices->num_can_discard++;
+
+       next_device = list_entry(fs_info->fs_devices->devices.next,
+                                struct btrfs_device, dev_list);
+       if (tgtdev->bdev == fs_info->sb->s_bdev)
+               fs_info->sb->s_bdev = next_device->bdev;
+       if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
+               fs_info->fs_devices->latest_bdev = next_device->bdev;
+       list_del_rcu(&tgtdev->dev_list);
+
+       call_rcu(&tgtdev->rcu, free_device);
+
+       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+}
+
+int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
+                             struct btrfs_device **device)
+{
+       int ret = 0;
+       struct btrfs_super_block *disk_super;
+       u64 devid;
+       u8 *dev_uuid;
+       struct block_device *bdev;
+       struct buffer_head *bh;
+
+       *device = NULL;
+       ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
+                                   root->fs_info->bdev_holder, 0, &bdev, &bh);
+       if (ret)
+               return ret;
+       disk_super = (struct btrfs_super_block *)bh->b_data;
+       devid = btrfs_stack_device_id(&disk_super->dev_item);
+       dev_uuid = disk_super->dev_item.uuid;
+       *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+                                   disk_super->fsid);
+       brelse(bh);
+       if (!*device)
+               ret = -ENOENT;
+       blkdev_put(bdev, FMODE_READ);
+       return ret;
+}
+
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+                                        char *device_path,
+                                        struct btrfs_device **device)
+{
+       *device = NULL;
+       if (strcmp(device_path, "missing") == 0) {
+               struct list_head *devices;
+               struct btrfs_device *tmp;
+
+               devices = &root->fs_info->fs_devices->devices;
+               /*
+                * It is safe to read the devices since the volume_mutex
+                * is held by the caller.
+                */
+               list_for_each_entry(tmp, devices, dev_list) {
+                       if (tmp->in_fs_metadata && !tmp->bdev) {
+                               *device = tmp;
+                               break;
+                       }
+               }
+
+               if (!*device) {
+                       pr_err("btrfs: no missing device found\n");
+                       return -ENOENT;
+               }
+
+               return 0;
+       } else {
+               return btrfs_find_device_by_path(root, device_path, device);
+       }
+}
+
 /*
  * does all the dirty work required for changing file system's UUID.
  */
@@ -1630,7 +1864,8 @@ next_slot:
                read_extent_buffer(leaf, fs_uuid,
                                   (unsigned long)btrfs_device_fsid(dev_item),
                                   BTRFS_UUID_SIZE);
-               device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+               device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+                                          fs_uuid);
                BUG_ON(!device); /* Logic error */
 
                if (device->fs_devices->seeding) {
@@ -1678,16 +1913,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        filemap_write_and_wait(bdev->bd_inode->i_mapping);
 
        devices = &root->fs_info->fs_devices->devices;
-       /*
-        * we have the volume lock, so we don't need the extra
-        * device list mutex while reading the list here.
-        */
+
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        list_for_each_entry(device, devices, dev_list) {
                if (device->bdev == bdev) {
                        ret = -EEXIST;
+                       mutex_unlock(
+                               &root->fs_info->fs_devices->device_list_mutex);
                        goto error;
                }
        }
+       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
        device = kzalloc(sizeof(*device), GFP_NOFS);
        if (!device) {
@@ -1737,6 +1973,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        device->dev_root = root->fs_info->dev_root;
        device->bdev = bdev;
        device->in_fs_metadata = 1;
+       device->is_tgtdev_for_dev_replace = 0;
        device->mode = FMODE_EXCL;
        set_blocksize(device->bdev, 4096);
 
@@ -1844,6 +2081,98 @@ error:
        return ret;
 }
 
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+                                 struct btrfs_device **device_out)
+{
+       struct request_queue *q;
+       struct btrfs_device *device;
+       struct block_device *bdev;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct list_head *devices;
+       struct rcu_string *name;
+       int ret = 0;
+
+       *device_out = NULL;
+       if (fs_info->fs_devices->seeding)
+               return -EINVAL;
+
+       bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+                                 fs_info->bdev_holder);
+       if (IS_ERR(bdev))
+               return PTR_ERR(bdev);
+
+       filemap_write_and_wait(bdev->bd_inode->i_mapping);
+
+       devices = &fs_info->fs_devices->devices;
+       list_for_each_entry(device, devices, dev_list) {
+               if (device->bdev == bdev) {
+                       ret = -EEXIST;
+                       goto error;
+               }
+       }
+
+       device = kzalloc(sizeof(*device), GFP_NOFS);
+       if (!device) {
+               ret = -ENOMEM;
+               goto error;
+       }
+
+       name = rcu_string_strdup(device_path, GFP_NOFS);
+       if (!name) {
+               kfree(device);
+               ret = -ENOMEM;
+               goto error;
+       }
+       rcu_assign_pointer(device->name, name);
+
+       q = bdev_get_queue(bdev);
+       if (blk_queue_discard(q))
+               device->can_discard = 1;
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+       device->writeable = 1;
+       device->work.func = pending_bios_fn;
+       generate_random_uuid(device->uuid);
+       device->devid = BTRFS_DEV_REPLACE_DEVID;
+       spin_lock_init(&device->io_lock);
+       device->generation = 0;
+       device->io_width = root->sectorsize;
+       device->io_align = root->sectorsize;
+       device->sector_size = root->sectorsize;
+       device->total_bytes = i_size_read(bdev->bd_inode);
+       device->disk_total_bytes = device->total_bytes;
+       device->dev_root = fs_info->dev_root;
+       device->bdev = bdev;
+       device->in_fs_metadata = 1;
+       device->is_tgtdev_for_dev_replace = 1;
+       device->mode = FMODE_EXCL;
+       set_blocksize(device->bdev, 4096);
+       device->fs_devices = fs_info->fs_devices;
+       list_add(&device->dev_list, &fs_info->fs_devices->devices);
+       fs_info->fs_devices->num_devices++;
+       fs_info->fs_devices->open_devices++;
+       if (device->can_discard)
+               fs_info->fs_devices->num_can_discard++;
+       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+       *device_out = device;
+       return ret;
+
+error:
+       blkdev_put(bdev, FMODE_EXCL);
+       return ret;
+}
+
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+                                             struct btrfs_device *tgtdev)
+{
+       WARN_ON(fs_info->fs_devices->rw_devices == 0);
+       tgtdev->io_width = fs_info->dev_root->sectorsize;
+       tgtdev->io_align = fs_info->dev_root->sectorsize;
+       tgtdev->sector_size = fs_info->dev_root->sectorsize;
+       tgtdev->dev_root = fs_info->dev_root;
+       tgtdev->in_fs_metadata = 1;
+}
+
 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
                                        struct btrfs_device *device)
 {
@@ -1900,7 +2229,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 
        if (!device->writeable)
                return -EACCES;
-       if (new_size <= device->total_bytes)
+       if (new_size <= device->total_bytes ||
+           device->is_tgtdev_for_dev_replace)
                return -EINVAL;
 
        btrfs_set_super_total_bytes(super_copy, old_total + diff);
@@ -2338,18 +2668,6 @@ static int chunk_profiles_filter(u64 chunk_type,
        return 1;
 }
 
-static u64 div_factor_fine(u64 num, int factor)
-{
-       if (factor <= 0)
-               return 0;
-       if (factor >= 100)
-               return num;
-
-       num *= factor;
-       do_div(num, 100);
-       return num;
-}
-
 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
                              struct btrfs_balance_args *bargs)
 {
@@ -2360,7 +2678,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
        chunk_used = btrfs_block_group_used(&cache->item);
 
-       user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+       if (bargs->usage == 0)
+               user_thresh = 0;
+       else if (bargs->usage > 100)
+               user_thresh = cache->key.offset;
+       else
+               user_thresh = div_factor_fine(cache->key.offset,
+                                             bargs->usage);
+
        if (chunk_used < user_thresh)
                ret = 0;
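div_factor_fine() and div_factor() are dropped from this file in favour of the math.h include added at the top; chunk_usage_filter() above now only clamps bargs->usage to the 0..100 range before applying it to the block group size. As a worked example, a 1GiB chunk with usage=30 gives a relocation threshold of 322122547 bytes, reproduced here with a local copy of the helper (the real one lives in the kernel header):

#include <stdint.h>
#include <stdio.h>

/* same arithmetic as the div_factor_fine() removed from this file */
static uint64_t div_factor_fine(uint64_t num, int factor)
{
        if (factor <= 0)
                return 0;
        if (factor >= 100)
                return num;
        num *= factor;
        return num / 100;
}

int main(void)
{
        uint64_t chunk_size = 1024ULL * 1024 * 1024;    /* cache->key.offset, 1GiB */
        int usage = 30;                                 /* bargs->usage */

        /* chunks with less used space than this are balanced away */
        printf("user_thresh = %llu bytes\n",
               (unsigned long long)div_factor_fine(chunk_size, usage));
        return 0;
}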
 
@@ -2514,15 +2839,6 @@ static int should_balance_chunk(struct btrfs_root *root,
        return 1;
 }
 
-static u64 div_factor(u64 num, int factor)
-{
-       if (factor == 10)
-               return num;
-       num *= factor;
-       do_div(num, 10);
-       return num;
-}
-
 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 {
        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -2550,7 +2866,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
                size_to_free = div_factor(old_size, 1);
                size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
                if (!device->writeable ||
-                   device->total_bytes - device->bytes_used > size_to_free)
+                   device->total_bytes - device->bytes_used > size_to_free ||
+                   device->is_tgtdev_for_dev_replace)
                        continue;
 
                ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -2713,6 +3030,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
        unset_balance_control(fs_info);
        ret = del_balance_item(fs_info->tree_root);
        BUG_ON(ret);
+
+       atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
 }
 
 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
@@ -2728,6 +3047,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        u64 allowed;
        int mixed = 0;
        int ret;
+       u64 num_devices;
+       unsigned seq;
 
        if (btrfs_fs_closing(fs_info) ||
            atomic_read(&fs_info->balance_pause_req) ||
@@ -2756,10 +3077,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
                }
        }
 
+       num_devices = fs_info->fs_devices->num_devices;
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
+       if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+               BUG_ON(num_devices < 1);
+               num_devices--;
+       }
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
        allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-       if (fs_info->fs_devices->num_devices == 1)
+       if (num_devices == 1)
                allowed |= BTRFS_BLOCK_GROUP_DUP;
-       else if (fs_info->fs_devices->num_devices < 4)
+       else if (num_devices < 4)
                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
        else
                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
@@ -2804,22 +3132,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        /* allow to reduce meta or sys integrity only if force set */
        allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
                        BTRFS_BLOCK_GROUP_RAID10;
-       if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-            (fs_info->avail_system_alloc_bits & allowed) &&
-            !(bctl->sys.target & allowed)) ||
-           ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-            (fs_info->avail_metadata_alloc_bits & allowed) &&
-            !(bctl->meta.target & allowed))) {
-               if (bctl->flags & BTRFS_BALANCE_FORCE) {
-                       printk(KERN_INFO "btrfs: force reducing metadata "
-                              "integrity\n");
-               } else {
-                       printk(KERN_ERR "btrfs: balance will reduce metadata "
-                              "integrity, use force if you want this\n");
-                       ret = -EINVAL;
-                       goto out;
+       do {
+               seq = read_seqbegin(&fs_info->profiles_lock);
+
+               if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                    (fs_info->avail_system_alloc_bits & allowed) &&
+                    !(bctl->sys.target & allowed)) ||
+                   ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                    (fs_info->avail_metadata_alloc_bits & allowed) &&
+                    !(bctl->meta.target & allowed))) {
+                       if (bctl->flags & BTRFS_BALANCE_FORCE) {
+                               printk(KERN_INFO "btrfs: force reducing metadata "
+                                      "integrity\n");
+                       } else {
+                               printk(KERN_ERR "btrfs: balance will reduce metadata "
+                                      "integrity, use force if you want this\n");
+                               ret = -EINVAL;
+                               goto out;
+                       }
                }
-       }
+       } while (read_seqretry(&fs_info->profiles_lock, seq));
 
        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
                int num_tolerated_disk_barrier_failures;
@@ -2884,8 +3216,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 out:
        if (bctl->flags & BTRFS_BALANCE_RESUME)
                __cancel_balance(fs_info);
-       else
+       else {
                kfree(bctl);
+               atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+       }
        return ret;
 }
 
@@ -2977,6 +3311,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
        btrfs_balance_sys(leaf, item, &disk_bargs);
        btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
 
+       WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+
        mutex_lock(&fs_info->volume_mutex);
        mutex_lock(&fs_info->balance_mutex);
 
@@ -3080,7 +3416,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
        u64 old_size = device->total_bytes;
        u64 diff = device->total_bytes - new_size;
 
-       if (new_size >= device->total_bytes)
+       if (device->is_tgtdev_for_dev_replace)
                return -EINVAL;
 
        path = btrfs_alloc_path();
@@ -3235,6 +3571,49 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
        return 0;
 }
 
+struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+       [BTRFS_RAID_RAID10] = {
+               .sub_stripes    = 2,
+               .dev_stripes    = 1,
+               .devs_max       = 0,    /* 0 == as many as possible */
+               .devs_min       = 4,
+               .devs_increment = 2,
+               .ncopies        = 2,
+       },
+       [BTRFS_RAID_RAID1] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 1,
+               .devs_max       = 2,
+               .devs_min       = 2,
+               .devs_increment = 2,
+               .ncopies        = 2,
+       },
+       [BTRFS_RAID_DUP] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 2,
+               .devs_max       = 1,
+               .devs_min       = 1,
+               .devs_increment = 1,
+               .ncopies        = 2,
+       },
+       [BTRFS_RAID_RAID0] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 1,
+               .devs_max       = 0,
+               .devs_min       = 2,
+               .devs_increment = 1,
+               .ncopies        = 1,
+       },
+       [BTRFS_RAID_SINGLE] = {
+               .sub_stripes    = 1,
+               .dev_stripes    = 1,
+               .devs_max       = 1,
+               .devs_min       = 1,
+               .devs_increment = 1,
+               .ncopies        = 1,
+       },
+};
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                               struct btrfs_root *extent_root,
                               struct map_lookup **map_ret,
@@ -3264,43 +3643,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        int ndevs;
        int i;
        int j;
+       int index;
 
        BUG_ON(!alloc_profile_is_valid(type, 0));
 
        if (list_empty(&fs_devices->alloc_list))
                return -ENOSPC;
 
-       sub_stripes = 1;
-       dev_stripes = 1;
-       devs_increment = 1;
-       ncopies = 1;
-       devs_max = 0;   /* 0 == as many as possible */
-       devs_min = 1;
+       index = __get_raid_index(type);
 
-       /*
-        * define the properties of each RAID type.
-        * FIXME: move this to a global table and use it in all RAID
-        * calculation code
-        */
-       if (type & (BTRFS_BLOCK_GROUP_DUP)) {
-               dev_stripes = 2;
-               ncopies = 2;
-               devs_max = 1;
-       } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-               devs_min = 2;
-       } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
-               devs_increment = 2;
-               ncopies = 2;
-               devs_max = 2;
-               devs_min = 2;
-       } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-               sub_stripes = 2;
-               devs_increment = 2;
-               ncopies = 2;
-               devs_min = 4;
-       } else {
-               devs_max = 1;
-       }
+       sub_stripes = btrfs_raid_array[index].sub_stripes;
+       dev_stripes = btrfs_raid_array[index].dev_stripes;
+       devs_max = btrfs_raid_array[index].devs_max;
+       devs_min = btrfs_raid_array[index].devs_min;
+       devs_increment = btrfs_raid_array[index].devs_increment;
+       ncopies = btrfs_raid_array[index].ncopies;
 
        if (type & BTRFS_BLOCK_GROUP_DATA) {
                max_stripe_size = 1024 * 1024 * 1024;
@@ -3347,13 +3704,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                cur = cur->next;
 
                if (!device->writeable) {
-                       printk(KERN_ERR
+                       WARN(1, KERN_ERR
                               "btrfs: read-only device in alloc_list\n");
-                       WARN_ON(1);
                        continue;
                }
 
-               if (!device->in_fs_metadata)
+               if (!device->in_fs_metadata ||
+                   device->is_tgtdev_for_dev_replace)
                        continue;
 
                if (device->total_bytes > device->bytes_used)
@@ -3382,6 +3739,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                devices_info[ndevs].total_avail = total_avail;
                devices_info[ndevs].dev = device;
                ++ndevs;
+               WARN_ON(ndevs > fs_devices->rw_devices);
        }
 
        /*
@@ -3467,12 +3825,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        if (ret)
                goto error;
 
-       ret = btrfs_make_block_group(trans, extent_root, 0, type,
-                                    BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-                                    start, num_bytes);
-       if (ret)
-               goto error;
-
        for (i = 0; i < map->num_stripes; ++i) {
                struct btrfs_device *device;
                u64 dev_offset;
@@ -3484,15 +3836,33 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                                info->chunk_root->root_key.objectid,
                                BTRFS_FIRST_CHUNK_TREE_OBJECTID,
                                start, dev_offset, stripe_size);
-               if (ret) {
-                       btrfs_abort_transaction(trans, extent_root, ret);
-                       goto error;
-               }
+               if (ret)
+                       goto error_dev_extent;
+       }
+
+       ret = btrfs_make_block_group(trans, extent_root, 0, type,
+                                    BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+                                    start, num_bytes);
+       if (ret) {
+               i = map->num_stripes - 1;
+               goto error_dev_extent;
        }
 
        kfree(devices_info);
        return 0;
 
+error_dev_extent:
+       for (; i >= 0; i--) {
+               struct btrfs_device *device;
+               int err;
+
+               device = map->stripes[i].dev;
+               err = btrfs_free_dev_extent(trans, device, start);
+               if (err) {
+                       btrfs_abort_transaction(trans, extent_root, err);
+                       break;
+               }
+       }
 error:
        kfree(map);
        kfree(devices_info);
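This hunk is the fix named in the commit title: the block group item is created only after all dev extents have been inserted, and on failure the stripes written so far are walked backwards and their dev extents freed instead of merely aborting the transaction. Reduced to its shape, this is the usual roll-back-what-succeeded pattern; a self-contained sketch with hypothetical alloc_one()/free_one()/commit_group() helpers:

#include <stdio.h>

struct item { int allocated; };         /* stands in for one stripe's dev extent */

static int alloc_one(struct item *it)   { it->allocated = 1; return 0; }
static void free_one(struct item *it)   { it->allocated = 0; }
static int commit_group(struct item *it, int n) { (void)it; (void)n; return -1; } /* force rollback */

static int alloc_all(struct item *items, int n)
{
        int i, ret;

        for (i = 0; i < n; i++) {
                ret = alloc_one(&items[i]);     /* like btrfs_alloc_dev_extent() */
                if (ret)
                        goto undo;              /* roll back items[0..i-1] */
        }

        ret = commit_group(items, n);           /* like btrfs_make_block_group() */
        if (!ret)
                return 0;
        /* the final step failed: every earlier allocation must be undone */
undo:
        while (--i >= 0)
                free_one(&items[i]);            /* like btrfs_free_dev_extent() */
        return ret;
}

int main(void)
{
        struct item stripes[4];

        printf("alloc_all returned %d\n", alloc_all(stripes, 4));
        return 0;
}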
@@ -3632,10 +4002,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
        if (ret)
                return ret;
 
-       alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-                               fs_info->avail_metadata_alloc_bits;
-       alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+       alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
        ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
                                  &stripe_size, chunk_offset, alloc_profile);
        if (ret)
@@ -3643,10 +4010,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 
        sys_chunk_offset = chunk_offset + chunk_size;
 
-       alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-                               fs_info->avail_system_alloc_bits;
-       alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+       alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
        ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
                                  &sys_chunk_size, &sys_stripe_size,
                                  sys_chunk_offset, alloc_profile);
@@ -3740,8 +4104,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
        }
 }
 
-int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 {
+       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
        struct extent_map *em;
        struct map_lookup *map;
        struct extent_map_tree *em_tree = &map_tree->map_tree;
@@ -3761,32 +4126,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
        else
                ret = 1;
        free_extent_map(em);
+
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
+       if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+               ret++;
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
        return ret;
 }
 
-static int find_live_mirror(struct map_lookup *map, int first, int num,
-                           int optimal)
+static int find_live_mirror(struct btrfs_fs_info *fs_info,
+                           struct map_lookup *map, int first, int num,
+                           int optimal, int dev_replace_is_ongoing)
 {
        int i;
-       if (map->stripes[optimal].dev->bdev)
-               return optimal;
-       for (i = first; i < first + num; i++) {
-               if (map->stripes[i].dev->bdev)
-                       return i;
+       int tolerance;
+       struct btrfs_device *srcdev;
+
+       if (dev_replace_is_ongoing &&
+           fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+            BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
+               srcdev = fs_info->dev_replace.srcdev;
+       else
+               srcdev = NULL;
+
+       /*
+        * try to avoid the drive that is the source drive for a
+        * dev-replace procedure, only choose it if no other non-missing
+        * mirror is available
+        */
+       for (tolerance = 0; tolerance < 2; tolerance++) {
+               if (map->stripes[optimal].dev->bdev &&
+                   (tolerance || map->stripes[optimal].dev != srcdev))
+                       return optimal;
+               for (i = first; i < first + num; i++) {
+                       if (map->stripes[i].dev->bdev &&
+                           (tolerance || map->stripes[i].dev != srcdev))
+                               return i;
+               }
        }
+
        /* we couldn't find one that doesn't fail.  Just return something
         * and the io error handling code will clean up eventually
         */
        return optimal;
 }
 
-static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num)
 {
        struct extent_map *em;
        struct map_lookup *map;
+       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
        struct extent_map_tree *em_tree = &map_tree->map_tree;
        u64 offset;
        u64 stripe_offset;
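find_live_mirror() above now makes two passes over the candidate stripes: the first pass refuses the dev-replace source drive, the second accepts it when nothing else is present. The same prefer-then-tolerate selection, pulled out into a small sketch with a made-up candidate array:

#include <stdio.h>

struct candidate { int present; int is_srcdev; };

/* two-pass pick: prefer a present candidate that is not the dev-replace
 * source; fall back to the source drive only if nothing else is present */
static int pick_mirror(const struct candidate *c, int first, int num, int optimal)
{
        int tolerance, i;

        for (tolerance = 0; tolerance < 2; tolerance++) {
                if (c[optimal].present && (tolerance || !c[optimal].is_srcdev))
                        return optimal;
                for (i = first; i < first + num; i++)
                        if (c[i].present && (tolerance || !c[i].is_srcdev))
                                return i;
        }
        return optimal; /* nothing usable; let the I/O error path sort it out */
}

int main(void)
{
        struct candidate stripes[2] = {
                { 1, 1 },       /* mirror 0: present, but it is the source drive */
                { 1, 0 },       /* mirror 1: present, ordinary device */
        };

        printf("picked mirror %d\n", pick_mirror(stripes, 0, 2, 0));
        return 0;
}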
@@ -3800,6 +4193,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
        int num_stripes;
        int max_errors = 0;
        struct btrfs_bio *bbio = NULL;
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+       int dev_replace_is_ongoing = 0;
+       int num_alloc_stripes;
+       int patch_the_first_stripe_for_dev_replace = 0;
+       u64 physical_to_patch_in_first_stripe = 0;
 
        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, logical, *length);
@@ -3816,9 +4214,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
        map = (struct map_lookup *)em->bdev;
        offset = logical - em->start;
 
-       if (mirror_num > map->num_stripes)
-               mirror_num = 0;
-
        stripe_nr = offset;
        /*
         * stripe_nr counts the total number of stripes we have to stride
@@ -3845,6 +4240,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
        if (!bbio_ret)
                goto out;
 
+       btrfs_dev_replace_lock(dev_replace);
+       dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+       if (!dev_replace_is_ongoing)
+               btrfs_dev_replace_unlock(dev_replace);
+
+       if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
+           !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
+           dev_replace->tgtdev != NULL) {
+               /*
+                * in dev-replace case, for repair case (that's the only
+                * case where the mirror is selected explicitly when
+                * calling btrfs_map_block), blocks left of the left cursor
+                * can also be read from the target drive.
+                * For REQ_GET_READ_MIRRORS, the target drive is added as
+                * the last one to the array of stripes. For READ, it also
+                * needs to be supported using the same mirror number.
+                * If the requested block is not left of the left cursor,
+                * EIO is returned. This can happen because btrfs_num_copies()
+                * returns one more in the dev-replace case.
+                */
+               u64 tmp_length = *length;
+               struct btrfs_bio *tmp_bbio = NULL;
+               int tmp_num_stripes;
+               u64 srcdev_devid = dev_replace->srcdev->devid;
+               int index_srcdev = 0;
+               int found = 0;
+               u64 physical_of_found = 0;
+
+               ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
+                            logical, &tmp_length, &tmp_bbio, 0);
+               if (ret) {
+                       WARN_ON(tmp_bbio != NULL);
+                       goto out;
+               }
+
+               tmp_num_stripes = tmp_bbio->num_stripes;
+               if (mirror_num > tmp_num_stripes) {
+                       /*
+                        * REQ_GET_READ_MIRRORS does not contain this
+                        * mirror, that means that the requested area
+                        * is not left of the left cursor
+                        */
+                       ret = -EIO;
+                       kfree(tmp_bbio);
+                       goto out;
+               }
+
+               /*
+                * process the rest of the function using the mirror_num
+                * of the source drive. Therefore look it up first.
+                * At the end, patch the device pointer to the one of the
+                * target drive.
+                */
+               for (i = 0; i < tmp_num_stripes; i++) {
+                       if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
+                               /*
+                                * In case of DUP, in order to keep it
+                                * simple, only add the mirror with the
+                                * lowest physical address
+                                */
+                               if (found &&
+                                   physical_of_found <=
+                                    tmp_bbio->stripes[i].physical)
+                                       continue;
+                               index_srcdev = i;
+                               found = 1;
+                               physical_of_found =
+                                       tmp_bbio->stripes[i].physical;
+                       }
+               }
+
+               if (found) {
+                       mirror_num = index_srcdev + 1;
+                       patch_the_first_stripe_for_dev_replace = 1;
+                       physical_to_patch_in_first_stripe = physical_of_found;
+               } else {
+                       WARN_ON(1);
+                       ret = -EIO;
+                       kfree(tmp_bbio);
+                       goto out;
+               }
+
+               kfree(tmp_bbio);
+       } else if (mirror_num > map->num_stripes) {
+               mirror_num = 0;
+       }
+
        num_stripes = 1;
        stripe_index = 0;
        stripe_nr_orig = stripe_nr;
@@ -3859,19 +4341,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                                            stripe_nr_end - stripe_nr_orig);
                stripe_index = do_div(stripe_nr, map->num_stripes);
        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-               if (rw & (REQ_WRITE | REQ_DISCARD))
+               if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
                        num_stripes = map->num_stripes;
                else if (mirror_num)
                        stripe_index = mirror_num - 1;
                else {
-                       stripe_index = find_live_mirror(map, 0,
+                       stripe_index = find_live_mirror(fs_info, map, 0,
                                            map->num_stripes,
-                                           current->pid % map->num_stripes);
+                                           current->pid % map->num_stripes,
+                                           dev_replace_is_ongoing);
                        mirror_num = stripe_index + 1;
                }
 
        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-               if (rw & (REQ_WRITE | REQ_DISCARD)) {
+               if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
                        num_stripes = map->num_stripes;
                } else if (mirror_num) {
                        stripe_index = mirror_num - 1;
@@ -3885,7 +4368,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                stripe_index = do_div(stripe_nr, factor);
                stripe_index *= map->sub_stripes;
 
-               if (rw & REQ_WRITE)
+               if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
                        num_stripes = map->sub_stripes;
                else if (rw & REQ_DISCARD)
                        num_stripes = min_t(u64, map->sub_stripes *
@@ -3895,9 +4378,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                        stripe_index += mirror_num - 1;
                else {
                        int old_stripe_index = stripe_index;
-                       stripe_index = find_live_mirror(map, stripe_index,
+                       stripe_index = find_live_mirror(fs_info, map,
+                                             stripe_index,
                                              map->sub_stripes, stripe_index +
-                                             current->pid % map->sub_stripes);
+                                             current->pid % map->sub_stripes,
+                                             dev_replace_is_ongoing);
                        mirror_num = stripe_index - old_stripe_index + 1;
                }
        } else {
@@ -3911,7 +4396,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
        }
        BUG_ON(stripe_index >= map->num_stripes);
 
-       bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+       num_alloc_stripes = num_stripes;
+       if (dev_replace_is_ongoing) {
+               if (rw & (REQ_WRITE | REQ_DISCARD))
+                       num_alloc_stripes <<= 1;
+               if (rw & REQ_GET_READ_MIRRORS)
+                       num_alloc_stripes++;
+       }
+       bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
        if (!bbio) {
                ret = -ENOMEM;
                goto out;
@@ -3998,7 +4490,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                }
        }
 
-       if (rw & REQ_WRITE) {
+       if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
                if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
                                 BTRFS_BLOCK_GROUP_RAID10 |
                                 BTRFS_BLOCK_GROUP_DUP)) {
@@ -4006,20 +4498,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                }
        }
 
+       if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+           dev_replace->tgtdev != NULL) {
+               int index_where_to_add;
+               u64 srcdev_devid = dev_replace->srcdev->devid;
+
+               /*
+                * duplicate the write operations while the dev replace
+                * procedure is running. Since the copying of the old disk
+                * to the new disk takes place at run time while the
+                * filesystem is mounted writable, the regular write
+                * operations to the old disk have to be duplicated to go
+                * to the new disk as well.
+                * Note that device->missing is handled by the caller, and
+                * that the write to the old disk is already set up in the
+                * stripes array.
+                */
+               index_where_to_add = num_stripes;
+               for (i = 0; i < num_stripes; i++) {
+                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
+                               /* write to new disk, too */
+                               struct btrfs_bio_stripe *new =
+                                       bbio->stripes + index_where_to_add;
+                               struct btrfs_bio_stripe *old =
+                                       bbio->stripes + i;
+
+                               new->physical = old->physical;
+                               new->length = old->length;
+                               new->dev = dev_replace->tgtdev;
+                               index_where_to_add++;
+                               max_errors++;
+                       }
+               }
+               num_stripes = index_where_to_add;
+       } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
+                  dev_replace->tgtdev != NULL) {
+               u64 srcdev_devid = dev_replace->srcdev->devid;
+               int index_srcdev = 0;
+               int found = 0;
+               u64 physical_of_found = 0;
+
+               /*
+                * During the dev-replace procedure, the target drive can
+                * also be used to read data in case it is needed to repair
+                * a corrupt block elsewhere. This is possible if the
+                * requested area is left of the left cursor. In this area,
+                * the target drive is a full copy of the source drive.
+                */
+               for (i = 0; i < num_stripes; i++) {
+                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
+                               /*
+                                * In case of DUP, in order to keep it
+                                * simple, only add the mirror with the
+                                * lowest physical address
+                                */
+                               if (found &&
+                                   physical_of_found <=
+                                    bbio->stripes[i].physical)
+                                       continue;
+                               index_srcdev = i;
+                               found = 1;
+                               physical_of_found = bbio->stripes[i].physical;
+                       }
+               }
+               if (found) {
+                       u64 length = map->stripe_len;
+
+                       if (physical_of_found + length <=
+                           dev_replace->cursor_left) {
+                               struct btrfs_bio_stripe *tgtdev_stripe =
+                                       bbio->stripes + num_stripes;
+
+                               tgtdev_stripe->physical = physical_of_found;
+                               tgtdev_stripe->length =
+                                       bbio->stripes[index_srcdev].length;
+                               tgtdev_stripe->dev = dev_replace->tgtdev;
+
+                               num_stripes++;
+                       }
+               }
+       }
+
        *bbio_ret = bbio;
        bbio->num_stripes = num_stripes;
        bbio->max_errors = max_errors;
        bbio->mirror_num = mirror_num;
+
+       /*
+        * This is the case where REQ_READ && dev_replace_is_ongoing &&
+        * mirror_num == num_stripes + 1 and the dev-replace target drive
+        * is available as a mirror.
+        */
+       if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
+               WARN_ON(num_stripes > 1);
+               bbio->stripes[0].dev = dev_replace->tgtdev;
+               bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
+               bbio->mirror_num = map->num_stripes + 1;
+       }
 out:
+       if (dev_replace_is_ongoing)
+               btrfs_dev_replace_unlock(dev_replace);
        free_extent_map(em);
        return ret;
 }
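
The write-duplication branch above is the core of how dev-replace keeps the target disk consistent: every stripe already aimed at the source device gets an extra stripe aimed at the target, at the same physical offset, and max_errors grows by one per copy since the duplicate may fail independently. (The REQ_GET_READ_MIRRORS branch follows the same pattern, but only adds the target as a read mirror when the requested range lies entirely left of cursor_left.) A minimal userspace C sketch of just that transformation, using simplified stand-in types (model_stripe/model_bbio and the devid numbers are illustrative, not the kernel structures):

#include <stdio.h>

#define MAX_STRIPES 16

/* simplified stand-ins for btrfs_bio_stripe / btrfs_bio (illustrative only) */
struct model_stripe {
	unsigned long long devid;
	unsigned long long physical;
	unsigned long long length;
};

struct model_bbio {
	int num_stripes;
	int max_errors;
	struct model_stripe stripes[MAX_STRIPES];
};

/* duplicate every stripe aimed at srcdev so the write also goes to tgtdev */
static void duplicate_writes_to_target(struct model_bbio *bbio,
				       unsigned long long srcdev,
				       unsigned long long tgtdev)
{
	int index_where_to_add = bbio->num_stripes;
	int i;

	for (i = 0; i < bbio->num_stripes; i++) {
		if (bbio->stripes[i].devid != srcdev)
			continue;
		bbio->stripes[index_where_to_add] = bbio->stripes[i];
		bbio->stripes[index_where_to_add].devid = tgtdev;
		index_where_to_add++;
		bbio->max_errors++;	/* the extra copy may fail on its own */
	}
	bbio->num_stripes = index_where_to_add;
}

int main(void)
{
	struct model_bbio bbio = {
		.num_stripes = 2,
		.max_errors = 1,
		.stripes = { { 1, 4096, 65536 }, { 2, 8192, 65536 } },
	};

	duplicate_writes_to_target(&bbio, 1, 100);
	printf("stripes=%d max_errors=%d\n", bbio.num_stripes, bbio.max_errors);
	return 0;
}

Running it on a two-stripe mapping where devid 1 is the source prints "stripes=3 max_errors=2": one new stripe pointing at the target device at the same physical offset.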
 
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                      u64 logical, u64 *length,
                      struct btrfs_bio **bbio_ret, int mirror_num)
 {
-       return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
+       return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
                                 mirror_num);
 }
 
@@ -4238,10 +4825,116 @@ static noinline void schedule_bio(struct btrfs_root *root,
                                   &device->work);
 }
 
+static int bio_size_ok(struct block_device *bdev, struct bio *bio,
+                      sector_t sector)
+{
+       struct bio_vec *prev;
+       struct request_queue *q = bdev_get_queue(bdev);
+       unsigned short max_sectors = queue_max_sectors(q);
+       struct bvec_merge_data bvm = {
+               .bi_bdev = bdev,
+               .bi_sector = sector,
+               .bi_rw = bio->bi_rw,
+       };
+
+       if (bio->bi_vcnt == 0) {
+               WARN_ON(1);
+               return 1;
+       }
+
+       prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
+       if ((bio->bi_size >> 9) > max_sectors)
+               return 0;
+
+       if (!q->merge_bvec_fn)
+               return 1;
+
+       bvm.bi_size = bio->bi_size - prev->bv_len;
+       if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
+               return 0;
+       return 1;
+}
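
bio_size_ok() above answers one question: can this bio go to the device as a single request? A rough userspace model of its two tests, with the driver's merge_bvec_fn replaced by a plain callback (the names and byte counts are made up for illustration, not block-layer API):

/*
 * Model of the two checks in bio_size_ok(): the whole bio must fit the
 * queue's max_sectors limit, and if the driver installs a merge callback
 * it must still accept the final vector on top of the rest.
 */
typedef int (*model_merge_fn)(unsigned int bytes_without_last_vec,
			      unsigned int last_vec_len);

int model_bio_size_ok(unsigned int bio_bytes, unsigned int last_vec_len,
		      unsigned short max_sectors, model_merge_fn merge_fn)
{
	/* the queue limit is expressed in 512-byte sectors */
	if ((bio_bytes >> 9) > max_sectors)
		return 0;

	/* no merge callback means no further driver-imposed restriction */
	if (!merge_fn)
		return 1;

	/* ask the driver about the last vector, as the kernel code does */
	return merge_fn(bio_bytes - last_vec_len, last_vec_len) >=
	       (int)last_vec_len;
}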
+
+static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+                             struct bio *bio, u64 physical, int dev_nr,
+                             int rw, int async)
+{
+       struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
+
+       bio->bi_private = bbio;
+       bio->bi_private = merge_stripe_index_into_bio_private(
+                       bio->bi_private, (unsigned int)dev_nr);
+       bio->bi_end_io = btrfs_end_bio;
+       bio->bi_sector = physical >> 9;
+#ifdef DEBUG
+       {
+               struct rcu_string *name;
+
+               rcu_read_lock();
+               name = rcu_dereference(dev->name);
+               pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
+                        "(%s id %llu), size=%u\n", rw,
+                        (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
+                        name->str, dev->devid, bio->bi_size);
+               rcu_read_unlock();
+       }
+#endif
+       bio->bi_bdev = dev->bdev;
+       if (async)
+               schedule_bio(root, dev, rw, bio);
+       else
+               btrfsic_submit_bio(rw, bio);
+}
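
submit_stripe_bio() relies on merge_stripe_index_into_bio_private() to fold the stripe number into the low bits of the bbio pointer stored in bi_private, so the completion handler can recover both from one field. A hedged sketch of that pointer-tagging idea, assuming at least 4-byte alignment and a 2-bit index; the kernel's exact mask width may differ:

#include <assert.h>
#include <stdint.h>

/* store a small index in the low bits of an aligned pointer (sketch only) */
void *tag_pointer(void *ptr, unsigned int index)
{
	assert(((uintptr_t)ptr & 3) == 0);	/* low two bits must be free */
	assert(index <= 3);
	return (void *)((uintptr_t)ptr | index);
}

unsigned int tagged_index(void *tagged)
{
	return (unsigned int)((uintptr_t)tagged & 3);
}

void *tagged_pointer(void *tagged)
{
	return (void *)((uintptr_t)tagged & ~(uintptr_t)3);
}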
+
+static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+                             struct bio *first_bio, struct btrfs_device *dev,
+                             int dev_nr, int rw, int async)
+{
+       struct bio_vec *bvec = first_bio->bi_io_vec;
+       struct bio *bio;
+       int nr_vecs = bio_get_nr_vecs(dev->bdev);
+       u64 physical = bbio->stripes[dev_nr].physical;
+
+again:
+       bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
+       if (!bio)
+               return -ENOMEM;
+
+       while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
+               if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+                                bvec->bv_offset) < bvec->bv_len) {
+                       u64 len = bio->bi_size;
+
+                       atomic_inc(&bbio->stripes_pending);
+                       submit_stripe_bio(root, bbio, bio, physical, dev_nr,
+                                         rw, async);
+                       physical += len;
+                       goto again;
+               }
+               bvec++;
+       }
+
+       submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
+       return 0;
+}
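
breakup_stripe_bio() keeps adding pages from the original bio until the device refuses one, submits what has been queued so far, advances the physical offset by the submitted byte count, and starts a fresh bio at the same page. A compact userspace model of that loop (the page sizes, the chunk limit and the printf stand in for bio_add_page() and the actual submission):

#include <stdio.h>

/*
 * Model of the breakup loop: pack pages into chunks no larger than
 * chunk_limit, "submitting" a chunk whenever the next page will not fit.
 * Assumes every individual page fits within chunk_limit.
 */
void breakup(const unsigned int *page_len, int nr_pages,
	     unsigned int chunk_limit, unsigned long long physical)
{
	unsigned int chunk = 0;
	int i;

	for (i = 0; i < nr_pages; i++) {
		if (chunk + page_len[i] > chunk_limit) {
			printf("submit %u bytes at %llu\n", chunk, physical);
			physical += chunk;	/* next chunk starts right after */
			chunk = 0;
		}
		chunk += page_len[i];	/* the refused page opens the new chunk */
	}
	printf("submit %u bytes at %llu\n", chunk, physical);
}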
+
+static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+{
+       atomic_inc(&bbio->error);
+       if (atomic_dec_and_test(&bbio->stripes_pending)) {
+               bio->bi_private = bbio->private;
+               bio->bi_end_io = bbio->end_io;
+               bio->bi_bdev = (struct block_device *)
+                       (unsigned long)bbio->mirror_num;
+               bio->bi_sector = logical >> 9;
+               kfree(bbio);
+               bio_endio(bio, -EIO);
+       }
+}
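
bbio_error() is the failure-side twin of the normal completion path: a stripe that cannot even be submitted still consumes one reference on stripes_pending, and only whoever drops the counter to zero restores the caller's end_io/private fields and reports -EIO. A hedged sketch of that last-one-out pattern with C11 atomics (model types, not the kernel ones):

#include <stdatomic.h>
#include <stdio.h>

struct model_bbio {
	atomic_int stripes_pending;
	atomic_int error;
};

/* called once for each stripe that could not be submitted */
void model_bbio_error(struct model_bbio *bbio)
{
	atomic_fetch_add(&bbio->error, 1);
	/* only the last reference completes the original bio, exactly once */
	if (atomic_fetch_sub(&bbio->stripes_pending, 1) == 1)
		printf("complete original bio with -EIO (%d stripe errors)\n",
		       atomic_load(&bbio->error));
}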
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                  int mirror_num, int async_submit)
 {
-       struct btrfs_mapping_tree *map_tree;
        struct btrfs_device *dev;
        struct bio *first_bio = bio;
        u64 logical = (u64)bio->bi_sector << 9;
@@ -4253,12 +4946,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
        struct btrfs_bio *bbio = NULL;
 
        length = bio->bi_size;
-       map_tree = &root->fs_info->mapping_tree;
        map_length = length;
 
-       ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
+       ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
                              mirror_num);
-       if (ret) /* -ENOMEM */
+       if (ret)
                return ret;
 
        total_devs = bbio->num_stripes;
@@ -4276,52 +4968,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
        atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
        while (dev_nr < total_devs) {
+               dev = bbio->stripes[dev_nr].dev;
+               if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
+                       bbio_error(bbio, first_bio, logical);
+                       dev_nr++;
+                       continue;
+               }
+
+               /*
+                * Check and see if we're ok with this bio based on its size
+                * and its offset on the given device.
+                */
+               if (!bio_size_ok(dev->bdev, first_bio,
+                                bbio->stripes[dev_nr].physical >> 9)) {
+                       ret = breakup_stripe_bio(root, bbio, first_bio, dev,
+                                                dev_nr, rw, async_submit);
+                       BUG_ON(ret);
+                       dev_nr++;
+                       continue;
+               }
+
                if (dev_nr < total_devs - 1) {
                        bio = bio_clone(first_bio, GFP_NOFS);
                        BUG_ON(!bio); /* -ENOMEM */
                } else {
                        bio = first_bio;
                }
-               bio->bi_private = bbio;
-               bio->bi_private = merge_stripe_index_into_bio_private(
-                               bio->bi_private, (unsigned int)dev_nr);
-               bio->bi_end_io = btrfs_end_bio;
-               bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
-               dev = bbio->stripes[dev_nr].dev;
-               if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
-#ifdef DEBUG
-                       struct rcu_string *name;
-
-                       rcu_read_lock();
-                       name = rcu_dereference(dev->name);
-                       pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
-                                "(%s id %llu), size=%u\n", rw,
-                                (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
-                                name->str, dev->devid, bio->bi_size);
-                       rcu_read_unlock();
-#endif
-                       bio->bi_bdev = dev->bdev;
-                       if (async_submit)
-                               schedule_bio(root, dev, rw, bio);
-                       else
-                               btrfsic_submit_bio(rw, bio);
-               } else {
-                       bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
-                       bio->bi_sector = logical >> 9;
-                       bio_endio(bio, -EIO);
-               }
+
+               submit_stripe_bio(root, bbio, bio,
+                                 bbio->stripes[dev_nr].physical, dev_nr, rw,
+                                 async_submit);
                dev_nr++;
        }
        return 0;
 }
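
The rewritten submission loop now makes three decisions per stripe: a missing or non-writable device goes through bbio_error(), a bio too large for the device is split by breakup_stripe_bio(), and otherwise the bio is cloned for every stripe except the last, which reuses first_bio. A small C sketch of just that decision order (the flags and the enum are simplifications for illustration, not the kernel control flow):

/* illustrative decision order for one stripe */
enum stripe_action {
	ACTION_BBIO_ERROR,	/* device missing or not writable for a write */
	ACTION_BREAKUP,		/* bio too large, split it per device limits */
	ACTION_SUBMIT_CLONE,	/* clone first_bio and submit the clone */
	ACTION_SUBMIT_FIRST	/* last stripe reuses first_bio directly */
};

enum stripe_action choose_action(int dev_present, int dev_writeable,
				 int is_write, int size_ok, int is_last)
{
	if (!dev_present || (is_write && !dev_writeable))
		return ACTION_BBIO_ERROR;
	if (!size_ok)
		return ACTION_BREAKUP;
	return is_last ? ACTION_SUBMIT_FIRST : ACTION_SUBMIT_CLONE;
}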
 
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
                                       u8 *uuid, u8 *fsid)
 {
        struct btrfs_device *device;
        struct btrfs_fs_devices *cur_devices;
 
-       cur_devices = root->fs_info->fs_devices;
+       cur_devices = fs_info->fs_devices;
        while (cur_devices) {
                if (!fsid ||
                    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
@@ -4402,6 +5090,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
        em->bdev = (struct block_device *)map;
        em->start = logical;
        em->len = length;
+       em->orig_start = 0;
        em->block_start = 0;
        em->block_len = em->len;
 
@@ -4419,8 +5108,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                read_extent_buffer(leaf, uuid, (unsigned long)
                                   btrfs_stripe_dev_uuid_nr(chunk, i),
                                   BTRFS_UUID_SIZE);
-               map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
-                                                       NULL);
+               map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
+                                                       uuid, NULL);
                if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
                        kfree(map);
                        free_extent_map(em);
@@ -4461,6 +5150,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
        device->io_align = btrfs_device_io_align(leaf, dev_item);
        device->io_width = btrfs_device_io_width(leaf, dev_item);
        device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+       WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
+       device->is_tgtdev_for_dev_replace = 0;
 
        ptr = (unsigned long)btrfs_device_uuid(dev_item);
        read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -4538,7 +5229,7 @@ static int read_one_dev(struct btrfs_root *root,
                        return ret;
        }
 
-       device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+       device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
        if (!device || !device->bdev) {
                if (!btrfs_test_opt(root, DEGRADED))
                        return -EIO;
@@ -4571,7 +5262,7 @@ static int read_one_dev(struct btrfs_root *root,
        fill_device_from_item(leaf, dev_item, device);
        device->dev_root = root->fs_info->dev_root;
        device->in_fs_metadata = 1;
-       if (device->writeable) {
+       if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                device->fs_devices->total_rw_bytes += device->total_bytes;
                spin_lock(&root->fs_info->free_chunk_lock);
                root->fs_info->free_chunk_space += device->total_bytes -
@@ -4930,7 +5621,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
        int i;
 
        mutex_lock(&fs_devices->device_list_mutex);
-       dev = btrfs_find_device(root, stats->devid, NULL, NULL);
+       dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
        mutex_unlock(&fs_devices->device_list_mutex);
 
        if (!dev) {
@@ -4958,3 +5649,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
                stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
        return 0;
 }
+
+int btrfs_scratch_superblock(struct btrfs_device *device)
+{
+       struct buffer_head *bh;
+       struct btrfs_super_block *disk_super;
+
+       bh = btrfs_read_dev_super(device->bdev);
+       if (!bh)
+               return -EINVAL;
+       disk_super = (struct btrfs_super_block *)bh->b_data;
+
+       memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+       set_buffer_dirty(bh);
+       sync_dirty_buffer(bh);
+       brelse(bh);
+
+       return 0;
+}
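
btrfs_scratch_superblock() retires a device by zeroing only the superblock magic and syncing the buffer back, leaving everything else in place so the device simply stops being detected as btrfs. A trivial userspace sketch of the same idea against an in-memory buffer (the offset and length parameters are placeholders, not the on-disk layout):

#include <string.h>

/* zero a magic field inside a raw superblock buffer (illustrative layout) */
void scratch_magic(unsigned char *sb_buf, size_t magic_off, size_t magic_len)
{
	memset(sb_buf + magic_off, 0, magic_len);
	/* the caller is expected to write the buffer back and flush it */
}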