Merge branches 'pm-domains', 'powercap' and 'pm-tools'

[cascardo/linux.git] / fs / btrfs / raid56.c
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c

index 6a41631..8ab2a17 100644 (file)
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,9 +58,23 @@
   */
  #define RBIO_CACHE_READY_BIT   3
  
+/*
+ * bbio and raid_map are managed by the caller, so we must not free
+ * them here. Besides that, rbios with this flag set must not be
+ * cached: the cache needs raid_map to check whether two rbios cover
+ * the same stripe, but the caller has very likely already freed
+ * raid_map by then, so don't cache those rbios.
+ */
+#define RBIO_HOLD_BBIO_MAP_BIT 4
  
  #define RBIO_CACHE_SIZE 1024
  
+enum btrfs_rbio_ops {
+       BTRFS_RBIO_WRITE        = 0,
+       BTRFS_RBIO_READ_REBUILD = 1,
+       BTRFS_RBIO_PARITY_SCRUB = 2,
+};
+
  struct btrfs_raid_bio {
         struct btrfs_fs_info *fs_info;
         struct btrfs_bio *bbio;
@@ -117,13 +131,16 @@ struct btrfs_raid_bio {
         /* number of data stripes (no p/q) */
         int nr_data;
  
+       int real_stripes;
+
+       int stripe_npages;
         /*
          * set if we're doing a parity rebuild
          * for a read from higher up, which is handled
          * differently from a parity rebuild as part of
          * rmw
          */
-       int read_rebuild;
+       enum btrfs_rbio_ops operation;
  
         /* first bad stripe */
         int faila;
@@ -131,6 +148,7 @@ struct btrfs_raid_bio {
         /* second bad stripe (for raid6 use) */
         int failb;
  
+       int scrubp;
         /*
          * number of pages needed to represent the full
          * stripe
@@ -144,8 +162,13 @@ struct btrfs_raid_bio {
          */
         int bio_list_bytes;
  
+       int generic_bio_cnt;
+
         atomic_t refs;
  
+       atomic_t stripes_pending;
+
+       atomic_t error;
         /*
          * these are two arrays of pointers.  We allocate the
          * rbio big enough to hold them both and setup their
@@ -162,6 +185,11 @@ struct btrfs_raid_bio {
          * here for faster lookup
          */
         struct page **bio_pages;
+
+       /*
+        * bitmap to record which horizontal stripe has data
+        */
+       unsigned long *dbitmap;
  };
  
  static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
@@ -176,6 +204,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio);
  static void index_rbio_pages(struct btrfs_raid_bio *rbio);
  static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
  
+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+                                        int need_check);
+static void async_scrub_parity(struct btrfs_raid_bio *rbio);
+
  /*
   * the stripe hash table is used for locking, and to collect
   * bios in hopes of making a full stripe
@@ -324,6 +356,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
  {
         bio_list_merge(&dest->bio_list, &victim->bio_list);
         dest->bio_list_bytes += victim->bio_list_bytes;
+       dest->generic_bio_cnt += victim->generic_bio_cnt;
         bio_list_init(&victim->bio_list);
  }
  
@@ -577,11 +610,20 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
             cur->raid_map[0])
                 return 0;
  
-       /* reads can't merge with writes */
-       if (last->read_rebuild !=
-           cur->read_rebuild) {
+       /* we can't merge with different operations */
+       if (last->operation != cur->operation)
+               return 0;
+       /*
+        * We need to read the full stripe from the drive,
+        * check and repair the parity, and write the new results.
+        *
+        * We're not allowed to add any new bios to the
+        * bio list here; anyone else that wants to
+        * change this stripe needs to do their own rmw.
+        */
+       if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
+           cur->operation == BTRFS_RBIO_PARITY_SCRUB)
                 return 0;
-       }
  
         return 1;
  }
@@ -601,7 +643,7 @@ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
   */
  static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
  {
-       if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
+       if (rbio->nr_data + 1 == rbio->real_stripes)
                 return NULL;
  
         index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
@@ -772,11 +814,14 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
                         spin_unlock(&rbio->bio_list_lock);
                         spin_unlock_irqrestore(&h->lock, flags);
  
-                       if (next->read_rebuild)
+                       if (next->operation == BTRFS_RBIO_READ_REBUILD)
                                 async_read_rebuild(next);
-                       else {
+                       else if (next->operation == BTRFS_RBIO_WRITE) {
                                 steal_rbio(rbio, next);
                                 async_rmw_stripe(next);
+                       } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
+                               steal_rbio(rbio, next);
+                               async_scrub_parity(next);
                         }
  
                         goto done_nolock;
@@ -796,6 +841,21 @@ done_nolock:
                 remove_rbio_from_cache(rbio);
  }
  
+static inline void
+__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
+{
+       if (need) {
+               kfree(raid_map);
+               kfree(bbio);
+       }
+}
+
+static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
+{
+       __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
+                       !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
+}
+
  static void __free_raid_bio(struct btrfs_raid_bio *rbio)
  {
         int i;
@@ -814,8 +874,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
                         rbio->stripe_pages[i] = NULL;
                 }
         }
-       kfree(rbio->raid_map);
-       kfree(rbio->bbio);
+
+       free_bbio_and_raid_map(rbio);
+
         kfree(rbio);
  }
  
@@ -833,6 +894,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
  {
         struct bio *cur = bio_list_get(&rbio->bio_list);
         struct bio *next;
+
+       if (rbio->generic_bio_cnt)
+               btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+
         free_raid_bio(rbio);
  
         while (cur) {
@@ -858,13 +923,13 @@ static void raid_write_end_io(struct bio *bio, int err)
  
         bio_put(bio);
  
-       if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+       if (!atomic_dec_and_test(&rbio->stripes_pending))
                 return;
  
         err = 0;
  
         /* OK, we have read all the stripes we need to. */
-       if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+       if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
                 err = -EIO;
  
         rbio_orig_end_io(rbio, err, 0);
@@ -925,16 +990,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
  {
         struct btrfs_raid_bio *rbio;
         int nr_data = 0;
-       int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
+       int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+       int num_pages = rbio_nr_pages(stripe_len, real_stripes);
+       int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
         void *p;
  
-       rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
+       rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
+                      DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
                         GFP_NOFS);
-       if (!rbio) {
-               kfree(raid_map);
-               kfree(bbio);
+       if (!rbio)
                 return ERR_PTR(-ENOMEM);
-       }
  
         bio_list_init(&rbio->bio_list);
         INIT_LIST_HEAD(&rbio->plug_list);
@@ -946,9 +1011,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
         rbio->fs_info = root->fs_info;
         rbio->stripe_len = stripe_len;
         rbio->nr_pages = num_pages;
+       rbio->real_stripes = real_stripes;
+       rbio->stripe_npages = stripe_npages;
         rbio->faila = -1;
         rbio->failb = -1;
         atomic_set(&rbio->refs, 1);
+       atomic_set(&rbio->error, 0);
+       atomic_set(&rbio->stripes_pending, 0);
  
         /*
          * the stripe_pages and bio_pages array point to the extra
@@ -957,11 +1026,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
         p = rbio + 1;
         rbio->stripe_pages = p;
         rbio->bio_pages = p + sizeof(struct page *) * num_pages;
+       rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
  
-       if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
-               nr_data = bbio->num_stripes - 2;
+       if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
+               nr_data = real_stripes - 2;
         else
-               nr_data = bbio->num_stripes - 1;
+               nr_data = real_stripes - 1;
  
         rbio->nr_data = nr_data;
         return rbio;
@@ -1073,7 +1143,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
  static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
  {
         if (rbio->faila >= 0 || rbio->failb >= 0) {
-               BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
+               BUG_ON(rbio->faila == rbio->real_stripes - 1);
                 __raid56_parity_recover(rbio);
         } else {
                 finish_rmw(rbio);
@@ -1134,7 +1204,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
  static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
  {
         struct btrfs_bio *bbio = rbio->bbio;
-       void *pointers[bbio->num_stripes];
+       void *pointers[rbio->real_stripes];
         int stripe_len = rbio->stripe_len;
         int nr_data = rbio->nr_data;
         int stripe;
@@ -1148,11 +1218,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
  
         bio_list_init(&bio_list);
  
-       if (bbio->num_stripes - rbio->nr_data == 1) {
-               p_stripe = bbio->num_stripes - 1;
-       } else if (bbio->num_stripes - rbio->nr_data == 2) {
-               p_stripe = bbio->num_stripes - 2;
-               q_stripe = bbio->num_stripes - 1;
+       if (rbio->real_stripes - rbio->nr_data == 1) {
+               p_stripe = rbio->real_stripes - 1;
+       } else if (rbio->real_stripes - rbio->nr_data == 2) {
+               p_stripe = rbio->real_stripes - 2;
+               q_stripe = rbio->real_stripes - 1;
         } else {
                 BUG();
         }
@@ -1169,7 +1239,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
         set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
         spin_unlock_irq(&rbio->bio_list_lock);
  
-       atomic_set(&rbio->bbio->error, 0);
+       atomic_set(&rbio->error, 0);
  
         /*
          * now that we've set rmw_locked, run through the
@@ -1209,7 +1279,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
                         SetPageUptodate(p);
                         pointers[stripe++] = kmap(p);
  
-                       raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
+                       raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
                                                 pointers);
                 } else {
                         /* raid5 */
@@ -1218,7 +1288,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
                 }
  
  
-               for (stripe = 0; stripe < bbio->num_stripes; stripe++)
+               for (stripe = 0; stripe < rbio->real_stripes; stripe++)
                         kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
         }
  
@@ -1227,7 +1297,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
          * higher layers (the bio_list in our rbio) and our p/q.  Ignore
          * everything else.
          */
-       for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+       for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
                 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
                         struct page *page;
                         if (stripe < rbio->nr_data) {
@@ -1245,8 +1315,34 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
                 }
         }
  
-       atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
-       BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
+       if (likely(!bbio->num_tgtdevs))
+               goto write_data;
+
+       for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+               if (!bbio->tgtdev_map[stripe])
+                       continue;
+
+               for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+                       struct page *page;
+                       if (stripe < rbio->nr_data) {
+                               page = page_in_rbio(rbio, stripe, pagenr, 1);
+                               if (!page)
+                                       continue;
+                       } else {
+                              page = rbio_stripe_page(rbio, stripe, pagenr);
+                       }
+
+                       ret = rbio_add_io_page(rbio, &bio_list, page,
+                                              rbio->bbio->tgtdev_map[stripe],
+                                              pagenr, rbio->stripe_len);
+                       if (ret)
+                               goto cleanup;
+               }
+       }
+
+write_data:
+       atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
+       BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
  
         while (1) {
                 bio = bio_list_pop(&bio_list);
@@ -1283,7 +1379,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
                 stripe = &rbio->bbio->stripes[i];
                 stripe_start = stripe->physical;
                 if (physical >= stripe_start &&
-                   physical < stripe_start + rbio->stripe_len) {
+                   physical < stripe_start + rbio->stripe_len &&
+                   bio->bi_bdev == stripe->dev->bdev) {
                         return i;
                 }
         }
@@ -1331,11 +1428,11 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
         if (rbio->faila == -1) {
                 /* first failure on this rbio */
                 rbio->faila = failed;
-               atomic_inc(&rbio->bbio->error);
+               atomic_inc(&rbio->error);
         } else if (rbio->failb == -1) {
                 /* second failure on this rbio */
                 rbio->failb = failed;
-               atomic_inc(&rbio->bbio->error);
+               atomic_inc(&rbio->error);
         } else {
                 ret = -EIO;
         }
@@ -1394,11 +1491,11 @@ static void raid_rmw_end_io(struct bio *bio, int err)
  
         bio_put(bio);
  
-       if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+       if (!atomic_dec_and_test(&rbio->stripes_pending))
                 return;
  
         err = 0;
-       if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+       if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
                 goto cleanup;
  
         /*
@@ -1439,7 +1536,6 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio)
  static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
  {
         int bios_to_read = 0;
-       struct btrfs_bio *bbio = rbio->bbio;
         struct bio_list bio_list;
         int ret;
         int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1455,7 +1551,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
  
         index_rbio_pages(rbio);
  
-       atomic_set(&rbio->bbio->error, 0);
+       atomic_set(&rbio->error, 0);
         /*
          * build a list of bios to read all the missing parts of this
          * stripe
@@ -1503,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
          * the bbio may be freed once we submit the last bio.  Make sure
          * not to touch it after that
          */
-       atomic_set(&bbio->stripes_pending, bios_to_read);
+       atomic_set(&rbio->stripes_pending, bios_to_read);
         while (1) {
                 bio = bio_list_pop(&bio_list);
                 if (!bio)
@@ -1686,19 +1782,30 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
         struct btrfs_raid_bio *rbio;
         struct btrfs_plug_cb *plug = NULL;
         struct blk_plug_cb *cb;
+       int ret;
  
         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-       if (IS_ERR(rbio))
+       if (IS_ERR(rbio)) {
+               __free_bbio_and_raid_map(bbio, raid_map, 1);
                 return PTR_ERR(rbio);
+       }
         bio_list_add(&rbio->bio_list, bio);
         rbio->bio_list_bytes = bio->bi_iter.bi_size;
+       rbio->operation = BTRFS_RBIO_WRITE;
+
+       btrfs_bio_counter_inc_noblocked(root->fs_info);
+       rbio->generic_bio_cnt = 1;
  
         /*
          * don't plug on full rbios, just get them out the door
          * as quickly as we can
          */
-       if (rbio_is_full(rbio))
-               return full_stripe_write(rbio);
+       if (rbio_is_full(rbio)) {
+               ret = full_stripe_write(rbio);
+               if (ret)
+                       btrfs_bio_counter_dec(root->fs_info);
+               return ret;
+       }
  
         cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
                                sizeof(*plug));
@@ -1709,10 +1816,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
                         INIT_LIST_HEAD(&plug->rbio_list);
                 }
                 list_add_tail(&rbio->plug_list, &plug->rbio_list);
+               ret = 0;
         } else {
-               return __raid56_parity_write(rbio);
+               ret = __raid56_parity_write(rbio);
+               if (ret)
+                       btrfs_bio_counter_dec(root->fs_info);
         }
-       return 0;
+       return ret;
  }
  
  /*
@@ -1730,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
         int err;
         int i;
  
-       pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
+       pointers = kzalloc(rbio->real_stripes * sizeof(void *),
                            GFP_NOFS);
         if (!pointers) {
                 err = -ENOMEM;
@@ -1740,7 +1850,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
         faila = rbio->faila;
         failb = rbio->failb;
  
-       if (rbio->read_rebuild) {
+       if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
                 spin_lock_irq(&rbio->bio_list_lock);
                 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
                 spin_unlock_irq(&rbio->bio_list_lock);
@@ -1749,15 +1859,23 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
         index_rbio_pages(rbio);
  
         for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+               /*
+                * For now the bitmap is only used by parity scrub, where
+                * it marks the horizontal stripes that hold data.
+                */
+               if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+                   !test_bit(pagenr, rbio->dbitmap))
+                       continue;
+
                 /* setup our array of pointers with pages
                  * from each stripe
                  */
-               for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+               for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
                         /*
                          * if we're rebuilding a read, we have to use
                          * pages from the bio list
                          */
-                       if (rbio->read_rebuild &&
+                       if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
                             (stripe == faila || stripe == failb)) {
                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
                         } else {
@@ -1767,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                 }
  
                 /* all raid6 handling here */
-               if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
+               if (rbio->raid_map[rbio->real_stripes - 1] ==
                     RAID6_Q_STRIPE) {
  
                         /*
@@ -1817,10 +1935,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                         }
  
                         if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
-                               raid6_datap_recov(rbio->bbio->num_stripes,
+                               raid6_datap_recov(rbio->real_stripes,
                                                   PAGE_SIZE, faila, pointers);
                         } else {
-                               raid6_2data_recov(rbio->bbio->num_stripes,
+                               raid6_2data_recov(rbio->real_stripes,
                                                   PAGE_SIZE, faila, failb,
                                                   pointers);
                         }
@@ -1850,7 +1968,7 @@ pstripe:
                  * know they can be trusted.  If this was a read reconstruction,
                  * other endio functions will fiddle the uptodate bits
                  */
-               if (!rbio->read_rebuild) {
+               if (rbio->operation == BTRFS_RBIO_WRITE) {
                         for (i = 0;  i < nr_pages; i++) {
                                 if (faila != -1) {
                                         page = rbio_stripe_page(rbio, faila, i);
@@ -1862,12 +1980,12 @@ pstripe:
                                 }
                         }
                 }
-               for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+               for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
                         /*
                          * if we're rebuilding a read, we have to use
                          * pages from the bio list
                          */
-                       if (rbio->read_rebuild &&
+                       if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
                             (stripe == faila || stripe == failb)) {
                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
                         } else {
@@ -1882,9 +2000,9 @@ cleanup:
         kfree(pointers);
  
  cleanup_io:
-
-       if (rbio->read_rebuild) {
-               if (err == 0)
+       if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+               if (err == 0 &&
+                   !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
                         cache_rbio_pages(rbio);
                 else
                         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -1893,7 +2011,13 @@ cleanup_io:
         } else if (err == 0) {
                 rbio->faila = -1;
                 rbio->failb = -1;
-               finish_rmw(rbio);
+
+               if (rbio->operation == BTRFS_RBIO_WRITE)
+                       finish_rmw(rbio);
+               else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
+                       finish_parity_scrub(rbio, 0);
+               else
+                       BUG();
         } else {
                 rbio_orig_end_io(rbio, err, 0);
         }
@@ -1917,10 +2041,10 @@ static void raid_recover_end_io(struct bio *bio, int err)
                 set_bio_pages_uptodate(bio);
         bio_put(bio);
  
-       if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+       if (!atomic_dec_and_test(&rbio->stripes_pending))
                 return;
  
-       if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+       if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
                 rbio_orig_end_io(rbio, -EIO, 0);
         else
                 __raid_recover_end_io(rbio);
@@ -1937,7 +2061,6 @@ static void raid_recover_end_io(struct bio *bio, int err)
  static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
  {
         int bios_to_read = 0;
-       struct btrfs_bio *bbio = rbio->bbio;
         struct bio_list bio_list;
         int ret;
         int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1951,16 +2074,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
         if (ret)
                 goto cleanup;
  
-       atomic_set(&rbio->bbio->error, 0);
+       atomic_set(&rbio->error, 0);
  
         /*
          * read everything that hasn't failed.  Thanks to the
          * stripe cache, it is possible that some or all of these
          * pages are going to be uptodate.
          */
-       for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+       for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
                 if (rbio->faila == stripe || rbio->failb == stripe) {
-                       atomic_inc(&rbio->bbio->error);
+                       atomic_inc(&rbio->error);
                         continue;
                 }
  
@@ -1990,7 +2113,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
                  * were up to date, or we might have no bios to read because
                  * the devices were gone.
                  */
-               if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
+               if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
                         __raid_recover_end_io(rbio);
                         goto out;
                 } else {
@@ -2002,7 +2125,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
          * the bbio may be freed once we submit the last bio.  Make sure
          * not to touch it after that
          */
-       atomic_set(&bbio->stripes_pending, bios_to_read);
+       atomic_set(&rbio->stripes_pending, bios_to_read);
         while (1) {
                 bio = bio_list_pop(&bio_list);
                 if (!bio)
@@ -2021,7 +2144,7 @@ out:
         return 0;
  
  cleanup:
-       if (rbio->read_rebuild)
+       if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
                 rbio_orig_end_io(rbio, -EIO, 0);
         return -EIO;
  }
@@ -2034,34 +2157,42 @@ cleanup:
   */
  int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
                           struct btrfs_bio *bbio, u64 *raid_map,
-                         u64 stripe_len, int mirror_num)
+                         u64 stripe_len, int mirror_num, int generic_io)
  {
         struct btrfs_raid_bio *rbio;
         int ret;
  
         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-       if (IS_ERR(rbio))
+       if (IS_ERR(rbio)) {
+               __free_bbio_and_raid_map(bbio, raid_map, generic_io);
                 return PTR_ERR(rbio);
+       }
  
-       rbio->read_rebuild = 1;
+       rbio->operation = BTRFS_RBIO_READ_REBUILD;
         bio_list_add(&rbio->bio_list, bio);
         rbio->bio_list_bytes = bio->bi_iter.bi_size;
  
         rbio->faila = find_logical_bio_stripe(rbio, bio);
         if (rbio->faila == -1) {
                 BUG();
-               kfree(raid_map);
-               kfree(bbio);
+               __free_bbio_and_raid_map(bbio, raid_map, generic_io);
                 kfree(rbio);
                 return -EIO;
         }
  
+       if (generic_io) {
+               btrfs_bio_counter_inc_noblocked(root->fs_info);
+               rbio->generic_bio_cnt = 1;
+       } else {
+               set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
+       }
+
         /*
          * reconstruct from the q stripe if they are
          * asking for mirror 3
          */
         if (mirror_num == 3)
-               rbio->failb = bbio->num_stripes - 2;
+               rbio->failb = rbio->real_stripes - 2;
  
         ret = lock_stripe_add(rbio);
  
@@ -2098,3 +2229,483 @@ static void read_rebuild_work(struct btrfs_work *work)
         rbio = container_of(work, struct btrfs_raid_bio, work);
         __raid56_parity_recover(rbio);
  }
+
+/*
+ * The following code is used to scrub/replace the parity stripe
+ *
+ * Note: We need to make sure that all the pages added to the scrub/replace
+ * raid bio are correct and are not changed during the scrub/replace. That
+ * is, those pages only hold metadata or file data with checksums.
+ */
+
+/*
+ * Allocate and set up a raid bio for scrubbing one parity stripe.
+ *
+ * @bio is a zero-sized bio queued on the rbio's bio_list; it only carries
+ * the completion handler.  @scrub_dev selects which stripe of @bbio is
+ * recorded as the one being scrubbed, and @dbitmap (@stripe_nsectors bits)
+ * is copied into the rbio.
+ *
+ * Returns the new rbio, or NULL if alloc_rbio() failed.
+ */
+struct btrfs_raid_bio *
+raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
+                              struct btrfs_bio *bbio, u64 *raid_map,
+                              u64 stripe_len, struct btrfs_device *scrub_dev,
+                              unsigned long *dbitmap, int stripe_nsectors)
+{
+       struct btrfs_raid_bio *rbio;
+       int i;
+
+       rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+       if (IS_ERR(rbio))
+               return NULL;
+       bio_list_add(&rbio->bio_list, bio);
+       /*
+        * This is a special bio which is used only to hold the completion
+        * handler, so that the scrub rbio looks like the other rbio types.
+        */
+       ASSERT(!bio->bi_iter.bi_size);
+       rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
+
+       for (i = 0; i < rbio->real_stripes; i++) {
+               if (bbio->stripes[i].dev == scrub_dev) {
+                       rbio->scrubp = i;
+                       break;
+               }
+       }
+       /*
+        * The device being scrubbed must be one of the stripes, otherwise
+        * scrubp would silently keep its initial value.
+        */
+       ASSERT(i < rbio->real_stripes);
+
+       /* For now only sectorsize == page size is supported */
+       ASSERT(root->sectorsize == PAGE_SIZE);
+       ASSERT(rbio->stripe_npages == stripe_nsectors);
+       bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
+
+       return rbio;
+}
+
+/*
+ * Attach @page, which holds the data at logical address @logical, to the
+ * scrub rbio.  @logical must lie inside the data part of the full stripe
+ * (it is asserted against raid_map[0] and stripe_len * nr_data); the page
+ * is stored in bio_pages[] at its page offset from the stripe start.
+ */
+void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
+                                  struct page *page, u64 logical)
+{
+       u64 full_stripe_start = rbio->raid_map[0];
+       int offset;
+
+       ASSERT(logical >= full_stripe_start);
+       ASSERT(logical + PAGE_SIZE <= full_stripe_start +
+                               rbio->stripe_len * rbio->nr_data);
+
+       offset = (int)(logical - full_stripe_start);
+       rbio->bio_pages[offset >> PAGE_CACHE_SHIFT] = page;
+}
+
+/*
+ * Only the horizontal page positions flagged in dbitmap take part in the
+ * parity scrub, so only those positions get a page in every stripe; the
+ * other stripe page slots are left untouched.
+ *
+ * Returns 0 on success or -ENOMEM if a page allocation fails.
+ */
+static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
+{
+       int pagenr;
+       int stripe;
+
+       for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+               for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+                       struct page **slot;
+
+                       slot = &rbio->stripe_pages[stripe *
+                                                  rbio->stripe_npages +
+                                                  pagenr];
+                       /* only fill the slots that have no page yet */
+                       if (!*slot) {
+                               *slot = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                               if (!*slot)
+                                       return -ENOMEM;
+                               ClearPageUptodate(*slot);
+                       }
+               }
+       }
+       return 0;
+}
+
+/*
+ * End io handler for the parity writes issued by finish_parity_scrub().
+ * A failed write is recorded with fail_bio_stripe().  Once the last
+ * pending write has completed, the original bio is ended with -EIO if
+ * rbio->error is non-zero and with 0 otherwise.
+ */
+static void raid_write_parity_end_io(struct bio *bio, int err)
+{
+       struct btrfs_raid_bio *rbio = bio->bi_private;
+
+       if (err)
+               fail_bio_stripe(rbio, bio);
+
+       bio_put(bio);
+
+       /* only the completion of the last pending write ends the rbio */
+       if (atomic_dec_and_test(&rbio->stripes_pending))
+               rbio_orig_end_io(rbio,
+                                atomic_read(&rbio->error) ? -EIO : 0, 0);
+}
+
+/*
+ * Recompute the parity for the pages flagged in rbio->dbitmap, repair the
+ * in-memory copy of the scrubbed parity stripe where it differs, and write
+ * the pages that needed repair back to the scrubbed stripe.  When the
+ * scrubbed stripe has a replace target, every page originally flagged in
+ * dbitmap is written to the target as well.
+ *
+ * @need_check: when zero, the parity comparison is skipped and the pages
+ *              flagged in dbitmap are written out as they are.
+ *
+ * The original bio is completed through rbio_orig_end_io(), either directly
+ * (nothing to write, or failure) or from raid_write_parity_end_io() once
+ * the last write finishes.
+ */
+static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
+                                        int need_check)
+{
+       struct btrfs_bio *bbio = rbio->bbio;
+       void *pointers[rbio->real_stripes];
+       DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
+       int nr_data = rbio->nr_data;
+       int nr_bios;
+       int stripe;
+       int pagenr;
+       int p_stripe = -1;
+       int q_stripe = -1;
+       struct page *p_page = NULL;
+       struct page *q_page = NULL;
+       struct bio_list bio_list;
+       struct bio *bio;
+       int is_replace = 0;
+       int ret;
+
+       bio_list_init(&bio_list);
+
+       if (rbio->real_stripes - rbio->nr_data == 1) {
+               p_stripe = rbio->real_stripes - 1;
+       } else if (rbio->real_stripes - rbio->nr_data == 2) {
+               p_stripe = rbio->real_stripes - 2;
+               q_stripe = rbio->real_stripes - 1;
+       } else {
+               BUG();
+       }
+
+       /*
+        * Remember the full set of pages before the check below clears the
+        * bits of pages whose parity is already right: the replace target
+        * needs all of them.
+        */
+       if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
+               is_replace = 1;
+               bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
+       }
+
+       /*
+        * Because the higher layers(scrubber) are unlikely to
+        * use this area of the disk again soon, so don't cache
+        * it.
+        */
+       clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+       if (!need_check)
+               goto writeback;
+
+       /* scratch pages that receive the freshly computed p (and q) */
+       p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       if (!p_page)
+               goto cleanup;
+       SetPageUptodate(p_page);
+
+       if (q_stripe != -1) {
+               q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+               if (!q_page) {
+                       __free_page(p_page);
+                       goto cleanup;
+               }
+               SetPageUptodate(q_page);
+       }
+
+       atomic_set(&rbio->error, 0);
+
+       for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+               struct page *p;
+               void *parity;
+               /* first collect one page from each data stripe */
+               for (stripe = 0; stripe < nr_data; stripe++) {
+                       p = page_in_rbio(rbio, stripe, pagenr, 0);
+                       pointers[stripe] = kmap(p);
+               }
+
+               /* then add the parity stripe */
+               pointers[stripe++] = kmap(p_page);
+
+               if (q_stripe != -1) {
+
+                       /*
+                        * raid6, add the qstripe and call the
+                        * library function to fill in our p/q
+                        */
+                       pointers[stripe++] = kmap(q_page);
+
+                       raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+                                               pointers);
+               } else {
+                       /* raid5 */
+                       memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+                       run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+               }
+
+               /* Check the scrubbed parity and repair it */
+               p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+               parity = kmap(p);
+               if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
+                       memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
+               else
+                       /* Parity is right, needn't writeback */
+                       bitmap_clear(rbio->dbitmap, pagenr, 1);
+               kunmap(p);
+
+               /*
+                * Drop exactly the mappings taken above: the data stripe
+                * pages plus the scratch p/q pages.  The rbio's own parity
+                * stripe pages were never mapped in this loop.
+                */
+               for (stripe = 0; stripe < nr_data; stripe++)
+                       kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+               kunmap(p_page);
+               if (q_page)
+                       kunmap(q_page);
+       }
+
+       __free_page(p_page);
+       if (q_page)
+               __free_page(q_page);
+
+writeback:
+       /*
+        * time to start writing.  Make bios for the pages of the scrubbed
+        * parity stripe that are still flagged in dbitmap.  Ignore
+        * everything else.
+        */
+       for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+               struct page *page;
+
+               page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+               ret = rbio_add_io_page(rbio, &bio_list,
+                              page, rbio->scrubp, pagenr, rbio->stripe_len);
+               if (ret)
+                       goto cleanup;
+       }
+
+       if (!is_replace)
+               goto submit_write;
+
+       /* the replace target gets every page that was originally flagged */
+       for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
+               struct page *page;
+
+               page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
+               ret = rbio_add_io_page(rbio, &bio_list, page,
+                                      bbio->tgtdev_map[rbio->scrubp],
+                                      pagenr, rbio->stripe_len);
+               if (ret)
+                       goto cleanup;
+       }
+
+submit_write:
+       nr_bios = bio_list_size(&bio_list);
+       if (!nr_bios) {
+               /* Every parity is right */
+               rbio_orig_end_io(rbio, 0, 0);
+               return;
+       }
+
+       atomic_set(&rbio->stripes_pending, nr_bios);
+
+       while (1) {
+               bio = bio_list_pop(&bio_list);
+               if (!bio)
+                       break;
+
+               bio->bi_private = rbio;
+               bio->bi_end_io = raid_write_parity_end_io;
+               BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+               submit_bio(WRITE, bio);
+       }
+       return;
+
+cleanup:
+       /* drop the bios that were built but never submitted */
+       while ((bio = bio_list_pop(&bio_list)))
+               bio_put(bio);
+       rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/* Return 1 if @stripe is in the range [0, rbio->nr_data), 0 otherwise. */
+static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
+{
+       return stripe >= 0 && stripe < rbio->nr_data;
+}
+
+/*
+ * Decide how to continue a parity scrub once the read phase is over.
+ * Reading pages off the disk may have failed: depending on which stripes
+ * failed we either check/repair the parity directly, rebuild the failed
+ * data stripe first, or give up and end the original bio with -EIO.
+ */
+static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
+{
+       int bad_data = 0;
+       int bad_parity = -1;
+
+       if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+               goto cleanup;
+
+       /* No stripe failed: verify the parity against the data. */
+       if (rbio->faila < 0 && rbio->failb < 0) {
+               finish_parity_scrub(rbio, 1);
+               return;
+       }
+
+       /*
+        * NOTE(review): is_parity_stripe() is defined outside this chunk;
+        * confirm it accepts a stripe index, which is what is passed here.
+        */
+       if (is_data_stripe(rbio, rbio->faila))
+               bad_data++;
+       else if (is_parity_stripe(rbio->faila))
+               bad_parity = rbio->faila;
+
+       if (is_data_stripe(rbio, rbio->failb))
+               bad_data++;
+       else if (is_parity_stripe(rbio->failb))
+               bad_parity = rbio->failb;
+
+       /*
+        * The parity being scrubbed cannot be used to repair data, so one
+        * less failure can be tolerated than usual.  (On RAID5 this means
+        * nothing can be repaired.)
+        */
+       if (bad_data > rbio->bbio->max_errors - 1)
+               goto cleanup;
+
+       /* All data stripes were read fine: skip the parity check. */
+       if (bad_data == 0) {
+               finish_parity_scrub(rbio, 0);
+               return;
+       }
+
+       /*
+        * One data stripe and one parity stripe failed on RAID6.  The data
+        * can only be rebuilt from the other parity, so this is repairable
+        * only if the failed parity is the one being scrubbed.
+        */
+       if (bad_parity != rbio->scrubp)
+               goto cleanup;
+
+       __raid_recover_end_io(rbio);
+       return;
+
+cleanup:
+       rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/*
+ * End io handler for the read phase of a parity scrub.  A successful read
+ * marks the bio's pages uptodate, a failed one is recorded with
+ * fail_bio_stripe().
+ *
+ * When the last pending read completes, the rbio is handed to
+ * validate_rbio_for_parity_scrub(), which decides how to continue.
+ */
+static void raid56_parity_scrub_end_io(struct bio *bio, int err)
+{
+       struct btrfs_raid_bio *rbio = bio->bi_private;
+
+       if (!err)
+               set_bio_pages_uptodate(bio);
+       else
+               fail_bio_stripe(rbio, bio);
+
+       bio_put(bio);
+
+       /* only the completion of the last read moves the scrub forward */
+       if (atomic_dec_and_test(&rbio->stripes_pending))
+               validate_rbio_for_parity_scrub(rbio);
+}
+
+/*
+ * Read phase of a parity scrub.  Allocates the stripe pages needed for the
+ * positions flagged in dbitmap, then reads from disk every such page that
+ * is neither supplied through the rbio's bio list nor already uptodate.
+ *
+ * When nothing has to be read, validate_rbio_for_parity_scrub() is called
+ * directly; otherwise it runs from raid56_parity_scrub_end_io() after the
+ * last read.  On failure the original bio is ended with -EIO.
+ */
+static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
+{
+       int bios_to_read = 0;
+       struct bio_list bio_list;
+       int ret;
+       int pagenr;
+       int stripe;
+       struct bio *bio;
+
+       /* the cleanup path walks this list, so set it up before any goto */
+       bio_list_init(&bio_list);
+
+       ret = alloc_rbio_essential_pages(rbio);
+       if (ret)
+               goto cleanup;
+
+       atomic_set(&rbio->error, 0);
+       /*
+        * build a list of bios to read all the missing parts of this
+        * stripe
+        */
+       for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+               for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
+                       struct page *page;
+                       /*
+                        * we want to find all the pages missing from
+                        * the rbio and read them from the disk.  If
+                        * page_in_rbio finds a page in the bio list
+                        * we don't need to read it off the stripe.
+                        */
+                       page = page_in_rbio(rbio, stripe, pagenr, 1);
+                       if (page)
+                               continue;
+
+                       page = rbio_stripe_page(rbio, stripe, pagenr);
+                       /*
+                        * the bio cache may have handed us an uptodate
+                        * page.  If so, be happy and use it
+                        */
+                       if (PageUptodate(page))
+                               continue;
+
+                       ret = rbio_add_io_page(rbio, &bio_list, page,
+                                      stripe, pagenr, rbio->stripe_len);
+                       if (ret)
+                               goto cleanup;
+               }
+       }
+
+       bios_to_read = bio_list_size(&bio_list);
+       if (!bios_to_read) {
+               /*
+                * Nothing is left to read, so go straight to the
+                * validation step.
+                */
+               goto finish;
+       }
+
+       /*
+        * the bbio may be freed once we submit the last bio.  Make sure
+        * not to touch it after that
+        */
+       atomic_set(&rbio->stripes_pending, bios_to_read);
+       while (1) {
+               bio = bio_list_pop(&bio_list);
+               if (!bio)
+                       break;
+
+               bio->bi_private = rbio;
+               bio->bi_end_io = raid56_parity_scrub_end_io;
+
+               btrfs_bio_wq_end_io(rbio->fs_info, bio,
+                                   BTRFS_WQ_ENDIO_RAID56);
+
+               BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+               submit_bio(READ, bio);
+       }
+       /* the parity check happens once the reads are done */
+       return;
+
+cleanup:
+       /* drop the bios that were built but never submitted */
+       while ((bio = bio_list_pop(&bio_list)))
+               bio_put(bio);
+       rbio_orig_end_io(rbio, -EIO, 0);
+       return;
+
+finish:
+       validate_rbio_for_parity_scrub(rbio);
+}
+
+/* Work callback: run the parity scrub of the rbio that embeds @work. */
+static void scrub_parity_work(struct btrfs_work *work)
+{
+       raid56_parity_scrub_stripe(container_of(work, struct btrfs_raid_bio,
+                                               work));
+}
+
+/*
+ * Set up the rbio's work item to run scrub_parity_work() and queue it on
+ * the filesystem's rmw_workers.
+ */
+static void async_scrub_parity(struct btrfs_raid_bio *rbio)
+{
+       struct btrfs_work *work = &rbio->work;
+
+       btrfs_init_work(work, btrfs_rmw_helper, scrub_parity_work, NULL, NULL);
+       btrfs_queue_work(rbio->fs_info->rmw_workers, work);
+}
+
+/*
+ * Submit a scrub rbio.  The scrub is queued for asynchronous processing
+ * only when lock_stripe_add() returns 0; nothing is done here for a
+ * non-zero return.
+ */
+void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
+{
+       if (lock_stripe_add(rbio) == 0)
+               async_scrub_parity(rbio);
+}