fs/btrfs/volumes.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18 #include <linux/sched.h>
  19 #include <linux/bio.h>
  20 #include <linux/slab.h>
  21 #include <linux/buffer_head.h>
  22 #include <linux/blkdev.h>
  23 #include <linux/random.h>
  24 #include <linux/iocontext.h>
  25 #include <linux/capability.h>
  26 #include <linux/ratelimit.h>
  27 #include <linux/kthread.h>
  28 #include <linux/raid/pq.h>
  29 #include <linux/semaphore.h>
  30 #include <asm/div64.h>
  31 #include "ctree.h"
  32 #include "extent_map.h"
  33 #include "disk-io.h"
  34 #include "transaction.h"
  35 #include "print-tree.h"
  36 #include "volumes.h"
  37 #include "raid56.h"
  38 #include "async-thread.h"
  39 #include "check-integrity.h"
  40 #include "rcu-string.h"
  41 #include "math.h"
  42 #include "dev-replace.h"
  43 #include "sysfs.h"
  44
/*
 * Prototypes for file-local (static) helpers whose definitions are not
 * part of the code shown before this point.
 */
static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

/*
 * Global (non-static) mutex.  In this file it is taken by
 * btrfs_close_extra_devices() and btrfs_close_devices().
 * NOTE(review): presumably it also serializes every access to fs_uuids;
 * confirm against the callers of the scan/registration paths.
 */
DEFINE_MUTEX(uuid_mutex);
/*
 * Head of the list of all known struct btrfs_fs_devices, linked through
 * their ->list member (see find_fsid() and device_list_add()).
 */
static LIST_HEAD(fs_uuids);
  55 struct list_head *btrfs_get_fs_uuids(void)
  56 {
  57         return &fs_uuids;
  58 }
  59
  60 static struct btrfs_fs_devices *__alloc_fs_devices(void)
  61 {
  62         struct btrfs_fs_devices *fs_devs;
  63
  64         fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
  65         if (!fs_devs)
  66                 return ERR_PTR(-ENOMEM);
  67
  68         mutex_init(&fs_devs->device_list_mutex);
  69
  70         INIT_LIST_HEAD(&fs_devs->devices);
  71         INIT_LIST_HEAD(&fs_devs->resized_devices);
  72         INIT_LIST_HEAD(&fs_devs->alloc_list);
  73         INIT_LIST_HEAD(&fs_devs->list);
  74
  75         return fs_devs;
  76 }
  77
  78 /**
  79  * alloc_fs_devices - allocate struct btrfs_fs_devices
  80  * @fsid:       a pointer to UUID for this FS.  If NULL a new UUID is
  81  *              generated.
  82  *
  83  * Return: a pointer to a new &struct btrfs_fs_devices on success;
  84  * ERR_PTR() on error.  Returned struct is not linked onto any lists and
  85  * can be destroyed with kfree() right away.
  86  */
  87 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
  88 {
  89         struct btrfs_fs_devices *fs_devs;
  90
  91         fs_devs = __alloc_fs_devices();
  92         if (IS_ERR(fs_devs))
  93                 return fs_devs;
  94
  95         if (fsid)
  96                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
  97         else
  98                 generate_random_uuid(fs_devs->fsid);
  99
 100         return fs_devs;
 101 }
 102
 103 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 104 {
 105         struct btrfs_device *device;
 106         WARN_ON(fs_devices->opened);
 107         while (!list_empty(&fs_devices->devices)) {
 108                 device = list_entry(fs_devices->devices.next,
 109                                     struct btrfs_device, dev_list);
 110                 list_del(&device->dev_list);
 111                 rcu_string_free(device->name);
 112                 kfree(device);
 113         }
 114         kfree(fs_devices);
 115 }
 116
 117 static void btrfs_kobject_uevent(struct block_device *bdev,
 118                                  enum kobject_action action)
 119 {
 120         int ret;
 121
 122         ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
 123         if (ret)
 124                 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
 125                         action,
 126                         kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
 127                         &disk_to_dev(bdev->bd_disk)->kobj);
 128 }
 129
 130 void btrfs_cleanup_fs_uuids(void)
 131 {
 132         struct btrfs_fs_devices *fs_devices;
 133
 134         while (!list_empty(&fs_uuids)) {
 135                 fs_devices = list_entry(fs_uuids.next,
 136                                         struct btrfs_fs_devices, list);
 137                 list_del(&fs_devices->list);
 138                 free_fs_devices(fs_devices);
 139         }
 140 }
 141
 142 static struct btrfs_device *__alloc_device(void)
 143 {
 144         struct btrfs_device *dev;
 145
 146         dev = kzalloc(sizeof(*dev), GFP_NOFS);
 147         if (!dev)
 148                 return ERR_PTR(-ENOMEM);
 149
 150         INIT_LIST_HEAD(&dev->dev_list);
 151         INIT_LIST_HEAD(&dev->dev_alloc_list);
 152         INIT_LIST_HEAD(&dev->resized_list);
 153
 154         spin_lock_init(&dev->io_lock);
 155
 156         spin_lock_init(&dev->reada_lock);
 157         atomic_set(&dev->reada_in_flight, 0);
 158         atomic_set(&dev->dev_stats_ccnt, 0);
 159         INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
 160         INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
 161
 162         return dev;
 163 }
 164
 165 static noinline struct btrfs_device *__find_device(struct list_head *head,
 166                                                    u64 devid, u8 *uuid)
 167 {
 168         struct btrfs_device *dev;
 169
 170         list_for_each_entry(dev, head, dev_list) {
 171                 if (dev->devid == devid &&
 172                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
 173                         return dev;
 174                 }
 175         }
 176         return NULL;
 177 }
 178
 179 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 180 {
 181         struct btrfs_fs_devices *fs_devices;
 182
 183         list_for_each_entry(fs_devices, &fs_uuids, list) {
 184                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 185                         return fs_devices;
 186         }
 187         return NULL;
 188 }
 189
 190 static int
 191 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 192                       int flush, struct block_device **bdev,
 193                       struct buffer_head **bh)
 194 {
 195         int ret;
 196
 197         *bdev = blkdev_get_by_path(device_path, flags, holder);
 198
 199         if (IS_ERR(*bdev)) {
 200                 ret = PTR_ERR(*bdev);
 201                 goto error;
 202         }
 203
 204         if (flush)
 205                 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
 206         ret = set_blocksize(*bdev, 4096);
 207         if (ret) {
 208                 blkdev_put(*bdev, flags);
 209                 goto error;
 210         }
 211         invalidate_bdev(*bdev);
 212         *bh = btrfs_read_dev_super(*bdev);
 213         if (IS_ERR(*bh)) {
 214                 ret = PTR_ERR(*bh);
 215                 blkdev_put(*bdev, flags);
 216                 goto error;
 217         }
 218
 219         return 0;
 220
 221 error:
 222         *bdev = NULL;
 223         *bh = NULL;
 224         return ret;
 225 }
 226
 227 static void requeue_list(struct btrfs_pending_bios *pending_bios,
 228                         struct bio *head, struct bio *tail)
 229 {
 230
 231         struct bio *old_head;
 232
 233         old_head = pending_bios->head;
 234         pending_bios->head = head;
 235         if (pending_bios->tail)
 236                 tail->bi_next = old_head;
 237         else
 238                 pending_bios->tail = tail;
 239 }
 240
/*
 * Submit the bios that have been queued on @device's pending_sync_bios
 * and pending_bios lists.
 *
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;          /* bios submitted since loop_lock */
        unsigned long batch_run = 0;    /* bios submitted during this call */
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, setup a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
        /*
         * waiters on async_submit_wait are woken below once nr_async_bios
         * drops under two thirds of the async submit limit
         */
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

        /* loop: entered without io_lock held */
loop:
        spin_lock(&device->io_lock);

        /* loop_lock: entered with io_lock already held */
loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         *
         * force_reg is set after the sync list was picked, so the next
         * pass through here picks the regular list even if more sync
         * bios are waiting.
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         *
         * Note this check runs before the chosen list is emptied below,
         * so 'again' is set whenever either list held bios, including
         * the one just taken.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                /*
                 * NOTE(review): presumably orders the unlocked reads of the
                 * list heads below against updates made on the queueing
                 * side; confirm the pairing barrier.
                 */
                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 *
                 * When the other list has bios waiting and enough bios
                 * were run from this one, the unprocessed remainder goes
                 * back to the front of its list and list selection is
                 * redone with io_lock held.
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                /* detach the first bio from the private chain */
                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;

                /* one fewer async bio outstanding; wake waiters if any */
                if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);

                /* sanity check: the bio's reference count must not be zero */
                BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;

                cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 *
                 * This is only done after more than 8 bios were submitted
                 * by this call and when the filesystem has more than one
                 * open device.
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                cond_resched();
                                continue;
                        }
                        /*
                         * give the remaining bios back to their list, keep
                         * running_pending set, queue this device's work
                         * item again and leave
                         */
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_queue_work(fs_info->submit_workers,
                                         &device->work);
                        goto done;
                }
                /* unplug every 64 requests just for good measure */
                if (batch_run % 64 == 0) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        /*
         * last look at both lists with io_lock held; if anything is
         * queued, start another pass (loop_lock expects the lock held)
         */
        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}
 438
 439 static void pending_bios_fn(struct btrfs_work *work)
 440 {
 441         struct btrfs_device *device;
 442
 443         device = container_of(work, struct btrfs_device, work);
 444         run_scheduled_bios(device);
 445 }
 446
 447
 448 void btrfs_free_stale_device(struct btrfs_device *cur_dev)
 449 {
 450         struct btrfs_fs_devices *fs_devs;
 451         struct btrfs_device *dev;
 452
 453         if (!cur_dev->name)
 454                 return;
 455
 456         list_for_each_entry(fs_devs, &fs_uuids, list) {
 457                 int del = 1;
 458
 459                 if (fs_devs->opened)
 460                         continue;
 461                 if (fs_devs->seeding)
 462                         continue;
 463
 464                 list_for_each_entry(dev, &fs_devs->devices, dev_list) {
 465
 466                         if (dev == cur_dev)
 467                                 continue;
 468                         if (!dev->name)
 469                                 continue;
 470
 471                         /*
 472                          * Todo: This won't be enough. What if the same device
 473                          * comes back (with new uuid and) with its mapper path?
 474                          * But for now, this does help as mostly an admin will
 475                          * either use mapper or non mapper path throughout.
 476                          */
 477                         rcu_read_lock();
 478                         del = strcmp(rcu_str_deref(dev->name),
 479                                                 rcu_str_deref(cur_dev->name));
 480                         rcu_read_unlock();
 481                         if (!del)
 482                                 break;
 483                 }
 484
 485                 if (!del) {
 486                         /* delete the stale device */
 487                         if (fs_devs->num_devices == 1) {
 488                                 btrfs_sysfs_remove_fsid(fs_devs);
 489                                 list_del(&fs_devs->list);
 490                                 free_fs_devices(fs_devs);
 491                         } else {
 492                                 fs_devs->num_devices--;
 493                                 list_del(&dev->dev_list);
 494                                 rcu_string_free(dev->name);
 495                                 kfree(dev);
 496                         }
 497                         break;
 498                 }
 499         }
 500 }
 501
 502 /*
 503  * Add new device to list of registered devices
 504  *
 505  * Returns:
 506  * 1   - first time device is seen
 507  * 0   - device already known
 508  * < 0 - error
 509  */
 510 static noinline int device_list_add(const char *path,
 511                            struct btrfs_super_block *disk_super,
 512                            u64 devid, struct btrfs_fs_devices **fs_devices_ret)
 513 {
 514         struct btrfs_device *device;
 515         struct btrfs_fs_devices *fs_devices;
 516         struct rcu_string *name;
 517         int ret = 0;
 518         u64 found_transid = btrfs_super_generation(disk_super);
 519
 520         fs_devices = find_fsid(disk_super->fsid);
 521         if (!fs_devices) {
 522                 fs_devices = alloc_fs_devices(disk_super->fsid);
 523                 if (IS_ERR(fs_devices))
 524                         return PTR_ERR(fs_devices);
 525
 526                 list_add(&fs_devices->list, &fs_uuids);
 527
 528                 device = NULL;
 529         } else {
 530                 device = __find_device(&fs_devices->devices, devid,
 531                                        disk_super->dev_item.uuid);
 532         }
 533
 534         if (!device) {
 535                 if (fs_devices->opened)
 536                         return -EBUSY;
 537
 538                 device = btrfs_alloc_device(NULL, &devid,
 539                                             disk_super->dev_item.uuid);
 540                 if (IS_ERR(device)) {
 541                         /* we can safely leave the fs_devices entry around */
 542                         return PTR_ERR(device);
 543                 }
 544
 545                 name = rcu_string_strdup(path, GFP_NOFS);
 546                 if (!name) {
 547                         kfree(device);
 548                         return -ENOMEM;
 549                 }
 550                 rcu_assign_pointer(device->name, name);
 551
 552                 mutex_lock(&fs_devices->device_list_mutex);
 553                 list_add_rcu(&device->dev_list, &fs_devices->devices);
 554                 fs_devices->num_devices++;
 555                 mutex_unlock(&fs_devices->device_list_mutex);
 556
 557                 ret = 1;
 558                 device->fs_devices = fs_devices;
 559         } else if (!device->name || strcmp(device->name->str, path)) {
 560                 /*
 561                  * When FS is already mounted.
 562                  * 1. If you are here and if the device->name is NULL that
 563                  *    means this device was missing at time of FS mount.
 564                  * 2. If you are here and if the device->name is different
 565                  *    from 'path' that means either
 566                  *      a. The same device disappeared and reappeared with
 567                  *         different name. or
 568                  *      b. The missing-disk-which-was-replaced, has
 569                  *         reappeared now.
 570                  *
 571                  * We must allow 1 and 2a above. But 2b would be a spurious
 572                  * and unintentional.
 573                  *
 574                  * Further in case of 1 and 2a above, the disk at 'path'
 575                  * would have missed some transaction when it was away and
 576                  * in case of 2a the stale bdev has to be updated as well.
 577                  * 2b must not be allowed at all time.
 578                  */
 579
 580                 /*
 581                  * For now, we do allow update to btrfs_fs_device through the
 582                  * btrfs dev scan cli after FS has been mounted.  We're still
 583                  * tracking a problem where systems fail mount by subvolume id
 584                  * when we reject replacement on a mounted FS.
 585                  */
 586                 if (!fs_devices->opened && found_transid < device->generation) {
 587                         /*
 588                          * That is if the FS is _not_ mounted and if you
 589                          * are here, that means there is more than one
 590                          * disk with same uuid and devid.We keep the one
 591                          * with larger generation number or the last-in if
 592                          * generation are equal.
 593                          */
 594                         return -EEXIST;
 595                 }
 596
 597                 name = rcu_string_strdup(path, GFP_NOFS);
 598                 if (!name)
 599                         return -ENOMEM;
 600                 rcu_string_free(device->name);
 601                 rcu_assign_pointer(device->name, name);
 602                 if (device->missing) {
 603                         fs_devices->missing_devices--;
 604                         device->missing = 0;
 605                 }
 606         }
 607
 608         /*
 609          * Unmount does not free the btrfs_device struct but would zero
 610          * generation along with most of the other members. So just update
 611          * it back. We need it to pick the disk with largest generation
 612          * (as above).
 613          */
 614         if (!fs_devices->opened)
 615                 device->generation = found_transid;
 616
 617         /*
 618          * if there is new btrfs on an already registered device,
 619          * then remove the stale device entry.
 620          */
 621         btrfs_free_stale_device(device);
 622
 623         *fs_devices_ret = fs_devices;
 624
 625         return ret;
 626 }
 627
 628 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 629 {
 630         struct btrfs_fs_devices *fs_devices;
 631         struct btrfs_device *device;
 632         struct btrfs_device *orig_dev;
 633
 634         fs_devices = alloc_fs_devices(orig->fsid);
 635         if (IS_ERR(fs_devices))
 636                 return fs_devices;
 637
 638         mutex_lock(&orig->device_list_mutex);
 639         fs_devices->total_devices = orig->total_devices;
 640
 641         /* We have held the volume lock, it is safe to get the devices. */
 642         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 643                 struct rcu_string *name;
 644
 645                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
 646                                             orig_dev->uuid);
 647                 if (IS_ERR(device))
 648                         goto error;
 649
 650                 /*
 651                  * This is ok to do without rcu read locked because we hold the
 652                  * uuid mutex so nothing we touch in here is going to disappear.
 653                  */
 654                 if (orig_dev->name) {
 655                         name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
 656                         if (!name) {
 657                                 kfree(device);
 658                                 goto error;
 659                         }
 660                         rcu_assign_pointer(device->name, name);
 661                 }
 662
 663                 list_add(&device->dev_list, &fs_devices->devices);
 664                 device->fs_devices = fs_devices;
 665                 fs_devices->num_devices++;
 666         }
 667         mutex_unlock(&orig->device_list_mutex);
 668         return fs_devices;
 669 error:
 670         mutex_unlock(&orig->device_list_mutex);
 671         free_fs_devices(fs_devices);
 672         return ERR_PTR(-ENOMEM);
 673 }
 674
 675 void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
 676 {
 677         struct btrfs_device *device, *next;
 678         struct btrfs_device *latest_dev = NULL;
 679
 680         mutex_lock(&uuid_mutex);
 681 again:
 682         /* This is the initialized path, it is safe to release the devices. */
 683         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 684                 if (device->in_fs_metadata) {
 685                         if (!device->is_tgtdev_for_dev_replace &&
 686                             (!latest_dev ||
 687                              device->generation > latest_dev->generation)) {
 688                                 latest_dev = device;
 689                         }
 690                         continue;
 691                 }
 692
 693                 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
 694                         /*
 695                          * In the first step, keep the device which has
 696                          * the correct fsid and the devid that is used
 697                          * for the dev_replace procedure.
 698                          * In the second step, the dev_replace state is
 699                          * read from the device tree and it is known
 700                          * whether the procedure is really active or
 701                          * not, which means whether this device is
 702                          * used or whether it should be removed.
 703                          */
 704                         if (step == 0 || device->is_tgtdev_for_dev_replace) {
 705                                 continue;
 706                         }
 707                 }
 708                 if (device->bdev) {
 709                         blkdev_put(device->bdev, device->mode);
 710                         device->bdev = NULL;
 711                         fs_devices->open_devices--;
 712                 }
 713                 if (device->writeable) {
 714                         list_del_init(&device->dev_alloc_list);
 715                         device->writeable = 0;
 716                         if (!device->is_tgtdev_for_dev_replace)
 717                                 fs_devices->rw_devices--;
 718                 }
 719                 list_del_init(&device->dev_list);
 720                 fs_devices->num_devices--;
 721                 rcu_string_free(device->name);
 722                 kfree(device);
 723         }
 724
 725         if (fs_devices->seed) {
 726                 fs_devices = fs_devices->seed;
 727                 goto again;
 728         }
 729
 730         fs_devices->latest_bdev = latest_dev->bdev;
 731
 732         mutex_unlock(&uuid_mutex);
 733 }
 734
 735 static void __free_device(struct work_struct *work)
 736 {
 737         struct btrfs_device *device;
 738
 739         device = container_of(work, struct btrfs_device, rcu_work);
 740
 741         if (device->bdev)
 742                 blkdev_put(device->bdev, device->mode);
 743
 744         rcu_string_free(device->name);
 745         kfree(device);
 746 }
 747
 748 static void free_device(struct rcu_head *head)
 749 {
 750         struct btrfs_device *device;
 751
 752         device = container_of(head, struct btrfs_device, rcu);
 753
 754         INIT_WORK(&device->rcu_work, __free_device);
 755         schedule_work(&device->rcu_work);
 756 }
 757
 758 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 759 {
 760         struct btrfs_device *device, *tmp;
 761
 762         if (--fs_devices->opened > 0)
 763                 return 0;
 764
 765         mutex_lock(&fs_devices->device_list_mutex);
 766         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
 767                 btrfs_close_one_device(device);
 768         }
 769         mutex_unlock(&fs_devices->device_list_mutex);
 770
 771         WARN_ON(fs_devices->open_devices);
 772         WARN_ON(fs_devices->rw_devices);
 773         fs_devices->opened = 0;
 774         fs_devices->seeding = 0;
 775
 776         return 0;
 777 }
 778
 779 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 780 {
 781         struct btrfs_fs_devices *seed_devices = NULL;
 782         int ret;
 783
 784         mutex_lock(&uuid_mutex);
 785         ret = __btrfs_close_devices(fs_devices);
 786         if (!fs_devices->opened) {
 787                 seed_devices = fs_devices->seed;
 788                 fs_devices->seed = NULL;
 789         }
 790         mutex_unlock(&uuid_mutex);
 791
 792         while (seed_devices) {
 793                 fs_devices = seed_devices;
 794                 seed_devices = fs_devices->seed;
 795                 __btrfs_close_devices(fs_devices);
 796                 free_fs_devices(fs_devices);
 797         }
 798         /*
 799          * Wait for rcu kworkers under __btrfs_close_devices
 800          * to finish all blkdev_puts so device is really
 801          * free when umount is done.
 802          */
 803         rcu_barrier();
 804         return ret;
 805 }
 806
 807 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 808                                 fmode_t flags, void *holder)
 809 {
 810         struct request_queue *q;
 811         struct block_device *bdev;
 812         struct list_head *head = &fs_devices->devices;
 813         struct btrfs_device *device;
 814         struct btrfs_device *latest_dev = NULL;
 815         struct buffer_head *bh;
 816         struct btrfs_super_block *disk_super;
 817         u64 devid;
 818         int seeding = 1;
 819         int ret = 0;
 820
 821         flags |= FMODE_EXCL;
 822
 823         list_for_each_entry(device, head, dev_list) {
 824                 if (device->bdev)
 825                         continue;
 826                 if (!device->name)
 827                         continue;
 828
 829                 /* Just open everything we can; ignore failures here */
 830                 if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
 831                                             &bdev, &bh))
 832                         continue;
 833
 834                 disk_super = (struct btrfs_super_block *)bh->b_data;
 835                 devid = btrfs_stack_device_id(&disk_super->dev_item);
 836                 if (devid != device->devid)
 837                         goto error_brelse;
 838
 839                 if (memcmp(device->uuid, disk_super->dev_item.uuid,
 840                            BTRFS_UUID_SIZE))
 841                         goto error_brelse;
 842
 843                 device->generation = btrfs_super_generation(disk_super);
 844                 if (!latest_dev ||
 845                     device->generation > latest_dev->generation)
 846                         latest_dev = device;
 847
 848                 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 849                         device->writeable = 0;
 850                 } else {
 851                         device->writeable = !bdev_read_only(bdev);
 852                         seeding = 0;
 853                 }
 854
 855                 q = bdev_get_queue(bdev);
 856                 if (blk_queue_discard(q))
 857                         device->can_discard = 1;
 858
 859                 device->bdev = bdev;
 860                 device->in_fs_metadata = 0;
 861                 device->mode = flags;
 862
 863                 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
 864                         fs_devices->rotating = 1;
 865
 866                 fs_devices->open_devices++;
 867                 if (device->writeable &&
 868                     device->devid != BTRFS_DEV_REPLACE_DEVID) {
 869                         fs_devices->rw_devices++;
 870                         list_add(&device->dev_alloc_list,
 871                                  &fs_devices->alloc_list);
 872                 }
 873                 brelse(bh);
 874                 continue;
 875
 876 error_brelse:
 877                 brelse(bh);
 878                 blkdev_put(bdev, flags);
 879                 continue;
 880         }
 881         if (fs_devices->open_devices == 0) {
 882                 ret = -EINVAL;
 883                 goto out;
 884         }
 885         fs_devices->seeding = seeding;
 886         fs_devices->opened = 1;
 887         fs_devices->latest_bdev = latest_dev->bdev;
 888         fs_devices->total_rw_bytes = 0;
 889 out:
 890         return ret;
 891 }
 892
 893 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 894                        fmode_t flags, void *holder)
 895 {
 896         int ret;
 897
 898         mutex_lock(&uuid_mutex);
 899         if (fs_devices->opened) {
 900                 fs_devices->opened++;
 901                 ret = 0;
 902         } else {
 903                 ret = __btrfs_open_devices(fs_devices, flags, holder);
 904         }
 905         mutex_unlock(&uuid_mutex);
 906         return ret;
 907 }
 908
 909 /*
 910  * Look for a btrfs signature on a device. This may be called out of the mount path
 911  * and we are not allowed to call set_blocksize during the scan. The superblock
 912  * is read via pagecache
 913  */
 914 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 915                           struct btrfs_fs_devices **fs_devices_ret)
 916 {
 917         struct btrfs_super_block *disk_super;
 918         struct block_device *bdev;
 919         struct page *page;
 920         void *p;
 921         int ret = -EINVAL;
 922         u64 devid;
 923         u64 transid;
 924         u64 total_devices;
 925         u64 bytenr;
 926         pgoff_t index;
 927
 928         /*
 929          * we would like to check all the supers, but that would make
 930          * a btrfs mount succeed after a mkfs from a different FS.
 931          * So, we need to add a special mount option to scan for
 932          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
 933          */
 934         bytenr = btrfs_sb_offset(0);
 935         flags |= FMODE_EXCL;
 936         mutex_lock(&uuid_mutex);
 937
 938         bdev = blkdev_get_by_path(path, flags, holder);
 939
 940         if (IS_ERR(bdev)) {
 941                 ret = PTR_ERR(bdev);
 942                 goto error;
 943         }
 944
 945         /* make sure our super fits in the device */
 946         if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
 947                 goto error_bdev_put;
 948
 949         /* make sure our super fits in the page */
 950         if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
 951                 goto error_bdev_put;
 952
 953         /* make sure our super doesn't straddle pages on disk */
 954         index = bytenr >> PAGE_CACHE_SHIFT;
 955         if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
 956                 goto error_bdev_put;
 957
 958         /* pull in the page with our super */
 959         page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
 960                                    index, GFP_NOFS);
 961
 962         if (IS_ERR_OR_NULL(page))
 963                 goto error_bdev_put;
 964
 965         p = kmap(page);
 966
 967         /* align our pointer to the offset of the super block */
 968         disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
 969
 970         if (btrfs_super_bytenr(disk_super) != bytenr ||
 971             btrfs_super_magic(disk_super) != BTRFS_MAGIC)
 972                 goto error_unmap;
 973
 974         devid = btrfs_stack_device_id(&disk_super->dev_item);
 975         transid = btrfs_super_generation(disk_super);
 976         total_devices = btrfs_super_num_devices(disk_super);
 977
 978         ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 979         if (ret > 0) {
 980                 if (disk_super->label[0]) {
 981                         if (disk_super->label[BTRFS_LABEL_SIZE - 1])
 982                                 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
 983                         printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
 984                 } else {
 985                         printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
 986                 }
 987
 988                 printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
 989                 ret = 0;
 990         }
 991         if (!ret && fs_devices_ret)
 992                 (*fs_devices_ret)->total_devices = total_devices;
 993
 994 error_unmap:
 995         kunmap(page);
 996         page_cache_release(page);
 997
 998 error_bdev_put:
 999         blkdev_put(bdev, flags);
1000 error:
1001         mutex_unlock(&uuid_mutex);
1002         return ret;
1003 }
1004
/*
 * helper to account the used device space in the range
 *
 * @device: device whose dev extents are examined (searched in
 *          device->dev_root, keyed by device->devid)
 * @start:  first byte of the range
 * @end:    last byte of the range; the arithmetic below treats it as
 *          inclusive (a fully covered range yields end - start + 1)
 * @length: output, number of bytes of [start, end] covered by dev extents
 *
 * *length is reset to 0 first and stays 0 when @start is at or beyond
 * device->total_bytes or the device is a dev-replace target.
 *
 * Returns 0 on success, -ENOMEM if no path can be allocated, or a negative
 * value from the tree search / leaf walk.
 */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                /*
                 * No exact match: step back one item so that an extent
                 * beginning before @start is examined as well.
                 */
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        /* Ran off this leaf: move on, or stop at tree end. */
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                /* Skip items of lower devids, stop at the first higher one. */
                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                /* extent_end is one past the last byte of this extent */
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        /* extent covers the whole range */
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        /* extent overlaps the head of the range */
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        /* extent lies entirely inside the range */
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        /* extent overlaps the tail of the range */
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        /* extent starts past the range: nothing more to add */
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}
1088
/*
 * Check whether the device range [*start, *start + len) overlaps a stripe
 * of a pending or pinned chunk on @device.
 *
 * When @transaction is non-NULL its pending_chunks list is walked first and
 * fs_info->pinned_chunks second; otherwise only pinned_chunks is walked.
 * For every overlapping stripe whose end lies beyond the current *start,
 * *start is advanced to that end.
 *
 * The overlap test always uses the value *start had on entry
 * (physical_start), while the advance compares against the live *start.
 *
 * Returns 1 if *start was advanced, 0 otherwise.
 */
static int contains_pending_extent(struct btrfs_transaction *transaction,
                                   struct btrfs_device *device,
                                   u64 *start, u64 len)
{
        struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
        struct extent_map *em;
        struct list_head *search_list = &fs_info->pinned_chunks;
        int ret = 0;
        u64 physical_start = *start;

        if (transaction)
                search_list = &transaction->pending_chunks;
again:
        list_for_each_entry(em, search_list, list) {
                struct map_lookup *map;
                int i;

                /* for these extent maps em->bdev holds the map_lookup */
                map = (struct map_lookup *)em->bdev;
                for (i = 0; i < map->num_stripes; i++) {
                        u64 end;

                        if (map->stripes[i].dev != device)
                                continue;
                        /* skip stripes entirely outside the queried range */
                        if (map->stripes[i].physical >= physical_start + len ||
                            map->stripes[i].physical + em->orig_block_len <=
                            physical_start)
                                continue;
                        /*
                         * Make sure that while processing the pinned list we do
                         * not override our *start with a lower value, because
                         * we can have pinned chunks that fall within this
                         * device hole and that have lower physical addresses
                         * than the pending chunks we processed before. If we
                         * do not take this special care we can end up getting
                         * 2 pending chunks that start at the same physical
                         * device offsets because the end offset of a pinned
                         * chunk can be equal to the start offset of some
                         * pending chunk.
                         */
                        end = map->stripes[i].physical + em->orig_block_len;
                        if (end > *start) {
                                *start = end;
                                ret = 1;
                        }
                }
        }
        /* second pass: the pinned chunks, unless that is what we just walked */
        if (search_list != &fs_info->pinned_chunks) {
                search_list = &fs_info->pinned_chunks;
                goto again;
        }

        return ret;
}
1142
1143
/*
 * find_free_dev_extent_start - find free space in the specified device
 * @transaction:  passed to contains_pending_extent() so that chunks pending
 *                in that transaction are excluded from the holes found
 * @device:       the device which we search the free space in
 * @num_bytes:    the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:        store the start of the free space.
 * @len:          the size of the free space that we find, or the size
 *                of the max free space if we don't find suitable free space;
 *                may be NULL
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * Returns 0 when a hole of at least @num_bytes was found, -ENOSPC when none
 * was found (or the search start is at or beyond device->total_bytes, or the
 * device is a dev-replace target), -ENOMEM if no path can be allocated, or a
 * negative value from the tree search.  Apart from the -ENOMEM case, @start
 * and @len are written on every return path, including errors.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
                               struct btrfs_device *device, u64 num_bytes,
                               u64 search_start, u64 *start, u64 *len)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        max_hole_start = search_start;
        max_hole_size = 0;

        /*
         * Restart point: reached again when the trailing hole turned out to
         * contain a pending extent and search_start was moved past it.
         */
again:
        if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
                ret = -ENOSPC;
                goto out;
        }

        path->reada = 2;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                /* Gap between the search position and this extent: a hole. */
                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        /*
                         * Have to check before we set max_hole_start, otherwise
                         * we could end up sending back this offset anyway.
                         */
                        if (contains_pending_extent(transaction, device,
                                                    &search_start,
                                                    hole_size)) {
                                /*
                                 * search_start was moved past the pending
                                 * extent; shrink the hole accordingly.
                                 */
                                if (key.offset >= search_start) {
                                        hole_size = key.offset - search_start;
                                } else {
                                        WARN_ON_ONCE(1);
                                        hole_size = 0;
                                }
                        }

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than which we need,
                         * it must be the max free space that we have found
                         * until now, so max_hole_start must point to the start
                         * of this free space and the length of this free space
                         * is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to the
                         * caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                /* Continue the search after the end of this extent. */
                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        /*
         * At this point, search_start should be the end of
         * allocated dev extents, and when shrinking the device,
         * search_end may be smaller than search_start.
         */
        if (search_end > search_start) {
                hole_size = search_end - search_start;

                /* A pending extent in the tail hole forces a fresh search. */
                if (contains_pending_extent(transaction, device, &search_start,
                                            hole_size)) {
                        btrfs_release_path(path);
                        goto again;
                }

                if (hole_size > max_hole_size) {
                        max_hole_start = search_start;
                        max_hole_size = hole_size;
                }
        }

        /* See above. */
        if (max_hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

out:
        btrfs_free_path(path);
        /* Report the best hole seen so far, whatever the return value. */
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}
1316
1317 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1318                          struct btrfs_device *device, u64 num_bytes,
1319                          u64 *start, u64 *len)
1320 {
1321         struct btrfs_root *root = device->dev_root;
1322         u64 search_start;
1323
1324         /* FIXME use last free of some kind */
1325
1326         /*
1327          * we don't want to overwrite the superblock on the drive,
1328          * so we make sure to start at an offset of at least 1MB
1329          */
1330         search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
1331         return find_free_dev_extent_start(trans->transaction, device,
1332                                           num_bytes, search_start, start, len);
1333 }
1334
1335 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1336                           struct btrfs_device *device,
1337                           u64 start, u64 *dev_extent_len)
1338 {
1339         int ret;
1340         struct btrfs_path *path;
1341         struct btrfs_root *root = device->dev_root;
1342         struct btrfs_key key;
1343         struct btrfs_key found_key;
1344         struct extent_buffer *leaf = NULL;
1345         struct btrfs_dev_extent *extent = NULL;
1346
1347         path = btrfs_alloc_path();
1348         if (!path)
1349                 return -ENOMEM;
1350
1351         key.objectid = device->devid;
1352         key.offset = start;
1353         key.type = BTRFS_DEV_EXTENT_KEY;
1354 again:
1355         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1356         if (ret > 0) {
1357                 ret = btrfs_previous_item(root, path, key.objectid,
1358                                           BTRFS_DEV_EXTENT_KEY);
1359                 if (ret)
1360                         goto out;
1361                 leaf = path->nodes[0];
1362                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1363                 extent = btrfs_item_ptr(leaf, path->slots[0],
1364                                         struct btrfs_dev_extent);
1365                 BUG_ON(found_key.offset > start || found_key.offset +
1366                        btrfs_dev_extent_length(leaf, extent) < start);
1367                 key = found_key;
1368                 btrfs_release_path(path);
1369                 goto again;
1370         } else if (ret == 0) {
1371                 leaf = path->nodes[0];
1372                 extent = btrfs_item_ptr(leaf, path->slots[0],
1373                                         struct btrfs_dev_extent);
1374         } else {
1375                 btrfs_std_error(root->fs_info, ret, "Slot search failed");
1376                 goto out;
1377         }
1378
1379         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1380
1381         ret = btrfs_del_item(trans, root, path);
1382         if (ret) {
1383                 btrfs_std_error(root->fs_info, ret,
1384                             "Failed to remove dev extent item");
1385         } else {
1386                 trans->transaction->have_free_bgs = 1;
1387         }
1388 out:
1389         btrfs_free_path(path);
1390         return ret;
1391 }
1392
1393 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1394                                   struct btrfs_device *device,
1395                                   u64 chunk_tree, u64 chunk_objectid,
1396                                   u64 chunk_offset, u64 start, u64 num_bytes)
1397 {
1398         int ret;
1399         struct btrfs_path *path;
1400         struct btrfs_root *root = device->dev_root;
1401         struct btrfs_dev_extent *extent;
1402         struct extent_buffer *leaf;
1403         struct btrfs_key key;
1404
1405         WARN_ON(!device->in_fs_metadata);
1406         WARN_ON(device->is_tgtdev_for_dev_replace);
1407         path = btrfs_alloc_path();
1408         if (!path)
1409                 return -ENOMEM;
1410
1411         key.objectid = device->devid;
1412         key.offset = start;
1413         key.type = BTRFS_DEV_EXTENT_KEY;
1414         ret = btrfs_insert_empty_item(trans, root, path, &key,
1415                                       sizeof(*extent));
1416         if (ret)
1417                 goto out;
1418
1419         leaf = path->nodes[0];
1420         extent = btrfs_item_ptr(leaf, path->slots[0],
1421                                 struct btrfs_dev_extent);
1422         btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
1423         btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
1424         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1425
1426         write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
1427                     btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE);
1428
1429         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1430         btrfs_mark_buffer_dirty(leaf);
1431 out:
1432         btrfs_free_path(path);
1433         return ret;
1434 }
1435
1436 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1437 {
1438         struct extent_map_tree *em_tree;
1439         struct extent_map *em;
1440         struct rb_node *n;
1441         u64 ret = 0;
1442
1443         em_tree = &fs_info->mapping_tree.map_tree;
1444         read_lock(&em_tree->lock);
1445         n = rb_last(&em_tree->map);
1446         if (n) {
1447                 em = rb_entry(n, struct extent_map, rb_node);
1448                 ret = em->start + em->len;
1449         }
1450         read_unlock(&em_tree->lock);
1451
1452         return ret;
1453 }
1454
1455 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1456                                     u64 *devid_ret)
1457 {
1458         int ret;
1459         struct btrfs_key key;
1460         struct btrfs_key found_key;
1461         struct btrfs_path *path;
1462
1463         path = btrfs_alloc_path();
1464         if (!path)
1465                 return -ENOMEM;
1466
1467         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1468         key.type = BTRFS_DEV_ITEM_KEY;
1469         key.offset = (u64)-1;
1470
1471         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1472         if (ret < 0)
1473                 goto error;
1474
1475         BUG_ON(ret == 0); /* Corruption */
1476
1477         ret = btrfs_previous_item(fs_info->chunk_root, path,
1478                                   BTRFS_DEV_ITEMS_OBJECTID,
1479                                   BTRFS_DEV_ITEM_KEY);
1480         if (ret) {
1481                 *devid_ret = 1;
1482         } else {
1483                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1484                                       path->slots[0]);
1485                 *devid_ret = found_key.offset + 1;
1486         }
1487         ret = 0;
1488 error:
1489         btrfs_free_path(path);
1490         return ret;
1491 }
1492
1493 /*
1494  * the device information is stored in the chunk root
1495  * the btrfs_device struct should be fully filled in
1496  */
1497 static int btrfs_add_device(struct btrfs_trans_handle *trans,
1498                             struct btrfs_root *root,
1499                             struct btrfs_device *device)
1500 {
1501         int ret;
1502         struct btrfs_path *path;
1503         struct btrfs_dev_item *dev_item;
1504         struct extent_buffer *leaf;
1505         struct btrfs_key key;
1506         unsigned long ptr;
1507
1508         root = root->fs_info->chunk_root;
1509
1510         path = btrfs_alloc_path();
1511         if (!path)
1512                 return -ENOMEM;
1513
1514         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1515         key.type = BTRFS_DEV_ITEM_KEY;
1516         key.offset = device->devid;
1517
1518         ret = btrfs_insert_empty_item(trans, root, path, &key,
1519                                       sizeof(*dev_item));
1520         if (ret)
1521                 goto out;
1522
1523         leaf = path->nodes[0];
1524         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1525
1526         btrfs_set_device_id(leaf, dev_item, device->devid);
1527         btrfs_set_device_generation(leaf, dev_item, 0);
1528         btrfs_set_device_type(leaf, dev_item, device->type);
1529         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1530         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1531         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1532         btrfs_set_device_total_bytes(leaf, dev_item,
1533                                      btrfs_device_get_disk_total_bytes(device));
1534         btrfs_set_device_bytes_used(leaf, dev_item,
1535                                     btrfs_device_get_bytes_used(device));
1536         btrfs_set_device_group(leaf, dev_item, 0);
1537         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1538         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1539         btrfs_set_device_start_offset(leaf, dev_item, 0);
1540
1541         ptr = btrfs_device_uuid(dev_item);
1542         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1543         ptr = btrfs_device_fsid(dev_item);
1544         write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
1545         btrfs_mark_buffer_dirty(leaf);
1546
1547         ret = 0;
1548 out:
1549         btrfs_free_path(path);
1550         return ret;
1551 }
1552
1553 /*
1554  * Function to update ctime/mtime for a given device path.
1555  * Mainly used for ctime/mtime based probe like libblkid.
1556  */
1557 static void update_dev_time(char *path_name)
1558 {
1559         struct file *filp;
1560
1561         filp = filp_open(path_name, O_RDWR, 0);
1562         if (IS_ERR(filp))
1563                 return;
1564         file_update_time(filp);
1565         filp_close(filp, NULL);
1566         return;
1567 }
1568
1569 static int btrfs_rm_dev_item(struct btrfs_root *root,
1570                              struct btrfs_device *device)
1571 {
1572         int ret;
1573         struct btrfs_path *path;
1574         struct btrfs_key key;
1575         struct btrfs_trans_handle *trans;
1576
1577         root = root->fs_info->chunk_root;
1578
1579         path = btrfs_alloc_path();
1580         if (!path)
1581                 return -ENOMEM;
1582
1583         trans = btrfs_start_transaction(root, 0);
1584         if (IS_ERR(trans)) {
1585                 btrfs_free_path(path);
1586                 return PTR_ERR(trans);
1587         }
1588         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1589         key.type = BTRFS_DEV_ITEM_KEY;
1590         key.offset = device->devid;
1591
1592         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1593         if (ret < 0)
1594                 goto out;
1595
1596         if (ret > 0) {
1597                 ret = -ENOENT;
1598                 goto out;
1599         }
1600
1601         ret = btrfs_del_item(trans, root, path);
1602         if (ret)
1603                 goto out;
1604 out:
1605         btrfs_free_path(path);
1606         btrfs_commit_transaction(trans, root);
1607         return ret;
1608 }
1609
1610 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1611 {
1612         struct btrfs_device *device;
1613         struct btrfs_device *next_device;
1614         struct block_device *bdev;
1615         struct buffer_head *bh = NULL;
1616         struct btrfs_super_block *disk_super;
1617         struct btrfs_fs_devices *cur_devices;
1618         u64 all_avail;
1619         u64 devid;
1620         u64 num_devices;
1621         u8 *dev_uuid;
1622         unsigned seq;
1623         int ret = 0;
1624         bool clear_super = false;
1625
1626         mutex_lock(&uuid_mutex);
1627
1628         do {
1629                 seq = read_seqbegin(&root->fs_info->profiles_lock);
1630
1631                 all_avail = root->fs_info->avail_data_alloc_bits |
1632                             root->fs_info->avail_system_alloc_bits |
1633                             root->fs_info->avail_metadata_alloc_bits;
1634         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
1635
1636         num_devices = root->fs_info->fs_devices->num_devices;
1637         btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1638         if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1639                 WARN_ON(num_devices < 1);
1640                 num_devices--;
1641         }
1642         btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1643
1644         if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1645                 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
1646                 goto out;
1647         }
1648
1649         if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1650                 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
1651                 goto out;
1652         }
1653
1654         if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1655             root->fs_info->fs_devices->rw_devices <= 2) {
1656                 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
1657                 goto out;
1658         }
1659         if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1660             root->fs_info->fs_devices->rw_devices <= 3) {
1661                 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
1662                 goto out;
1663         }
1664
1665         if (strcmp(device_path, "missing") == 0) {
1666                 struct list_head *devices;
1667                 struct btrfs_device *tmp;
1668
1669                 device = NULL;
1670                 devices = &root->fs_info->fs_devices->devices;
1671                 /*
1672                  * It is safe to read the devices since the volume_mutex
1673                  * is held.
1674                  */
1675                 list_for_each_entry(tmp, devices, dev_list) {
1676                         if (tmp->in_fs_metadata &&
1677                             !tmp->is_tgtdev_for_dev_replace &&
1678                             !tmp->bdev) {
1679                                 device = tmp;
1680                                 break;
1681                         }
1682                 }
1683                 bdev = NULL;
1684                 bh = NULL;
1685                 disk_super = NULL;
1686                 if (!device) {
1687                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1688                         goto out;
1689                 }
1690         } else {
1691                 ret = btrfs_get_bdev_and_sb(device_path,
1692                                             FMODE_WRITE | FMODE_EXCL,
1693                                             root->fs_info->bdev_holder, 0,
1694                                             &bdev, &bh);
1695                 if (ret)
1696                         goto out;
1697                 disk_super = (struct btrfs_super_block *)bh->b_data;
1698                 devid = btrfs_stack_device_id(&disk_super->dev_item);
1699                 dev_uuid = disk_super->dev_item.uuid;
1700                 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1701                                            disk_super->fsid);
1702                 if (!device) {
1703                         ret = -ENOENT;
1704                         goto error_brelse;
1705                 }
1706         }
1707
1708         if (device->is_tgtdev_for_dev_replace) {
1709                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1710                 goto error_brelse;
1711         }
1712
1713         if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1714                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1715                 goto error_brelse;
1716         }
1717
1718         if (device->writeable) {
1719                 lock_chunks(root);
1720                 list_del_init(&device->dev_alloc_list);
1721                 device->fs_devices->rw_devices--;
1722                 unlock_chunks(root);
1723                 clear_super = true;
1724         }
1725
1726         mutex_unlock(&uuid_mutex);
1727         ret = btrfs_shrink_device(device, 0);
1728         mutex_lock(&uuid_mutex);
1729         if (ret)
1730                 goto error_undo;
1731
1732         /*
1733          * TODO: the superblock still includes this device in its num_devices
1734          * counter although write_all_supers() is not locked out. This
1735          * could give a filesystem state which requires a degraded mount.
1736          */
1737         ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1738         if (ret)
1739                 goto error_undo;
1740
1741         device->in_fs_metadata = 0;
1742         btrfs_scrub_cancel_dev(root->fs_info, device);
1743
1744         /*
1745          * the device list mutex makes sure that we don't change
1746          * the device list while someone else is writing out all
1747          * the device supers. Whoever is writing all supers, should
1748          * lock the device list mutex before getting the number of
1749          * devices in the super block (super_copy). Conversely,
1750          * whoever updates the number of devices in the super block
1751          * (super_copy) should hold the device list mutex.
1752          */
1753
1754         cur_devices = device->fs_devices;
1755         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1756         list_del_rcu(&device->dev_list);
1757
1758         device->fs_devices->num_devices--;
1759         device->fs_devices->total_devices--;
1760
1761         if (device->missing)
1762                 device->fs_devices->missing_devices--;
1763
1764         next_device = list_entry(root->fs_info->fs_devices->devices.next,
1765                                  struct btrfs_device, dev_list);
1766         if (device->bdev == root->fs_info->sb->s_bdev)
1767                 root->fs_info->sb->s_bdev = next_device->bdev;
1768         if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1769                 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1770
1771         if (device->bdev) {
1772                 device->fs_devices->open_devices--;
1773                 /* remove sysfs entry */
1774                 btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
1775         }
1776
1777         call_rcu(&device->rcu, free_device);
1778
1779         num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1780         btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1781         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1782
1783         if (cur_devices->open_devices == 0) {
1784                 struct btrfs_fs_devices *fs_devices;
1785                 fs_devices = root->fs_info->fs_devices;
1786                 while (fs_devices) {
1787                         if (fs_devices->seed == cur_devices) {
1788                                 fs_devices->seed = cur_devices->seed;
1789                                 break;
1790                         }
1791                         fs_devices = fs_devices->seed;
1792                 }
1793                 cur_devices->seed = NULL;
1794                 __btrfs_close_devices(cur_devices);
1795                 free_fs_devices(cur_devices);
1796         }
1797
1798         root->fs_info->num_tolerated_disk_barrier_failures =
1799                 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1800
1801         /*
1802          * at this point, the device is zero sized.  We want to
1803          * remove it from the devices list and zero out the old super
1804          */
1805         if (clear_super && disk_super) {
1806                 u64 bytenr;
1807                 int i;
1808
1809                 /* make sure this device isn't detected as part of
1810                  * the FS anymore
1811                  */
1812                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1813                 set_buffer_dirty(bh);
1814                 sync_dirty_buffer(bh);
1815
1816                 /* clear the mirror copies of super block on the disk
1817                  * being removed, 0th copy is been taken care above and
1818                  * the below would take of the rest
1819                  */
1820                 for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1821                         bytenr = btrfs_sb_offset(i);
1822                         if (bytenr + BTRFS_SUPER_INFO_SIZE >=
1823                                         i_size_read(bdev->bd_inode))
1824                                 break;
1825
1826                         brelse(bh);
1827                         bh = __bread(bdev, bytenr / 4096,
1828                                         BTRFS_SUPER_INFO_SIZE);
1829                         if (!bh)
1830                                 continue;
1831
1832                         disk_super = (struct btrfs_super_block *)bh->b_data;
1833
1834                         if (btrfs_super_bytenr(disk_super) != bytenr ||
1835                                 btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1836                                 continue;
1837                         }
1838                         memset(&disk_super->magic, 0,
1839                                                 sizeof(disk_super->magic));
1840                         set_buffer_dirty(bh);
1841                         sync_dirty_buffer(bh);
1842                 }
1843         }
1844
1845         ret = 0;
1846
1847         if (bdev) {
1848                 /* Notify udev that device has changed */
1849                 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1850
1851                 /* Update ctime/mtime for device path for libblkid */
1852                 update_dev_time(device_path);
1853         }
1854
1855 error_brelse:
1856         brelse(bh);
1857         if (bdev)
1858                 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1859 out:
1860         mutex_unlock(&uuid_mutex);
1861         return ret;
1862 error_undo:
1863         if (device->writeable) {
1864                 lock_chunks(root);
1865                 list_add(&device->dev_alloc_list,
1866                          &root->fs_info->fs_devices->alloc_list);
1867                 device->fs_devices->rw_devices++;
1868                 unlock_chunks(root);
1869         }
1870         goto error_brelse;
1871 }
1872
1873 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
1874                                         struct btrfs_device *srcdev)
1875 {
1876         struct btrfs_fs_devices *fs_devices;
1877
1878         WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1879
1880         /*
1881          * in case of fs with no seed, srcdev->fs_devices will point
1882          * to fs_devices of fs_info. However when the dev being replaced is
1883          * a seed dev it will point to the seed's local fs_devices. In short
1884          * srcdev will have its correct fs_devices in both the cases.
1885          */
1886         fs_devices = srcdev->fs_devices;
1887
1888         list_del_rcu(&srcdev->dev_list);
1889         list_del_rcu(&srcdev->dev_alloc_list);
1890         fs_devices->num_devices--;
1891         if (srcdev->missing)
1892                 fs_devices->missing_devices--;
1893
1894         if (srcdev->writeable) {
1895                 fs_devices->rw_devices--;
1896                 /* zero out the old super if it is writable */
1897                 btrfs_scratch_superblocks(srcdev->bdev,
1898                                         rcu_str_deref(srcdev->name));
1899         }
1900
1901         if (srcdev->bdev)
1902                 fs_devices->open_devices--;
1903 }
1904
1905 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
1906                                       struct btrfs_device *srcdev)
1907 {
1908         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
1909
1910         call_rcu(&srcdev->rcu, free_device);
1911
1912         /*
1913          * unless fs_devices is seed fs, num_devices shouldn't go
1914          * zero
1915          */
1916         BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
1917
1918         /* if this is no devs we rather delete the fs_devices */
1919         if (!fs_devices->num_devices) {
1920                 struct btrfs_fs_devices *tmp_fs_devices;
1921
1922                 tmp_fs_devices = fs_info->fs_devices;
1923                 while (tmp_fs_devices) {
1924                         if (tmp_fs_devices->seed == fs_devices) {
1925                                 tmp_fs_devices->seed = fs_devices->seed;
1926                                 break;
1927                         }
1928                         tmp_fs_devices = tmp_fs_devices->seed;
1929                 }
1930                 fs_devices->seed = NULL;
1931                 __btrfs_close_devices(fs_devices);
1932                 free_fs_devices(fs_devices);
1933         }
1934 }
1935
1936 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1937                                       struct btrfs_device *tgtdev)
1938 {
1939         struct btrfs_device *next_device;
1940
1941         mutex_lock(&uuid_mutex);
1942         WARN_ON(!tgtdev);
1943         mutex_lock(&fs_info->fs_devices->device_list_mutex);
1944
1945         btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
1946
1947         if (tgtdev->bdev) {
1948                 btrfs_scratch_superblocks(tgtdev->bdev,
1949                                         rcu_str_deref(tgtdev->name));
1950                 fs_info->fs_devices->open_devices--;
1951         }
1952         fs_info->fs_devices->num_devices--;
1953
1954         next_device = list_entry(fs_info->fs_devices->devices.next,
1955                                  struct btrfs_device, dev_list);
1956         if (tgtdev->bdev == fs_info->sb->s_bdev)
1957                 fs_info->sb->s_bdev = next_device->bdev;
1958         if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1959                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1960         list_del_rcu(&tgtdev->dev_list);
1961
1962         call_rcu(&tgtdev->rcu, free_device);
1963
1964         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1965         mutex_unlock(&uuid_mutex);
1966 }
1967
1968 static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1969                                      struct btrfs_device **device)
1970 {
1971         int ret = 0;
1972         struct btrfs_super_block *disk_super;
1973         u64 devid;
1974         u8 *dev_uuid;
1975         struct block_device *bdev;
1976         struct buffer_head *bh;
1977
1978         *device = NULL;
1979         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1980                                     root->fs_info->bdev_holder, 0, &bdev, &bh);
1981         if (ret)
1982                 return ret;
1983         disk_super = (struct btrfs_super_block *)bh->b_data;
1984         devid = btrfs_stack_device_id(&disk_super->dev_item);
1985         dev_uuid = disk_super->dev_item.uuid;
1986         *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1987                                     disk_super->fsid);
1988         brelse(bh);
1989         if (!*device)
1990                 ret = -ENOENT;
1991         blkdev_put(bdev, FMODE_READ);
1992         return ret;
1993 }
1994
1995 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1996                                          char *device_path,
1997                                          struct btrfs_device **device)
1998 {
1999         *device = NULL;
2000         if (strcmp(device_path, "missing") == 0) {
2001                 struct list_head *devices;
2002                 struct btrfs_device *tmp;
2003
2004                 devices = &root->fs_info->fs_devices->devices;
2005                 /*
2006                  * It is safe to read the devices since the volume_mutex
2007                  * is held by the caller.
2008                  */
2009                 list_for_each_entry(tmp, devices, dev_list) {
2010                         if (tmp->in_fs_metadata && !tmp->bdev) {
2011                                 *device = tmp;
2012                                 break;
2013                         }
2014                 }
2015
2016                 if (!*device)
2017                         return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2018
2019                 return 0;
2020         } else {
2021                 return btrfs_find_device_by_path(root, device_path, device);
2022         }
2023 }
2024
2025 /*
2026  * does all the dirty work required for changing file system's UUID.
2027  */
2028 static int btrfs_prepare_sprout(struct btrfs_root *root)
2029 {
2030         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2031         struct btrfs_fs_devices *old_devices;
2032         struct btrfs_fs_devices *seed_devices;
2033         struct btrfs_super_block *disk_super = root->fs_info->super_copy;
2034         struct btrfs_device *device;
2035         u64 super_flags;
2036
2037         BUG_ON(!mutex_is_locked(&uuid_mutex));
2038         if (!fs_devices->seeding)
2039                 return -EINVAL;
2040
2041         seed_devices = __alloc_fs_devices();
2042         if (IS_ERR(seed_devices))
2043                 return PTR_ERR(seed_devices);
2044
2045         old_devices = clone_fs_devices(fs_devices);
2046         if (IS_ERR(old_devices)) {
2047                 kfree(seed_devices);
2048                 return PTR_ERR(old_devices);
2049         }
2050
2051         list_add(&old_devices->list, &fs_uuids);
2052
2053         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2054         seed_devices->opened = 1;
2055         INIT_LIST_HEAD(&seed_devices->devices);
2056         INIT_LIST_HEAD(&seed_devices->alloc_list);
2057         mutex_init(&seed_devices->device_list_mutex);
2058
2059         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2060         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2061                               synchronize_rcu);
2062         list_for_each_entry(device, &seed_devices->devices, dev_list)
2063                 device->fs_devices = seed_devices;
2064
2065         lock_chunks(root);
2066         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2067         unlock_chunks(root);
2068
2069         fs_devices->seeding = 0;
2070         fs_devices->num_devices = 0;
2071         fs_devices->open_devices = 0;
2072         fs_devices->missing_devices = 0;
2073         fs_devices->rotating = 0;
2074         fs_devices->seed = seed_devices;
2075
2076         generate_random_uuid(fs_devices->fsid);
2077         memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2078         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2079         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2080
2081         super_flags = btrfs_super_flags(disk_super) &
2082                       ~BTRFS_SUPER_FLAG_SEEDING;
2083         btrfs_set_super_flags(disk_super, super_flags);
2084
2085         return 0;
2086 }
2087
2088 /*
2089  * strore the expected generation for seed devices in device items.
2090  */
2091 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2092                                struct btrfs_root *root)
2093 {
2094         struct btrfs_path *path;
2095         struct extent_buffer *leaf;
2096         struct btrfs_dev_item *dev_item;
2097         struct btrfs_device *device;
2098         struct btrfs_key key;
2099         u8 fs_uuid[BTRFS_UUID_SIZE];
2100         u8 dev_uuid[BTRFS_UUID_SIZE];
2101         u64 devid;
2102         int ret;
2103
2104         path = btrfs_alloc_path();
2105         if (!path)
2106                 return -ENOMEM;
2107
2108         root = root->fs_info->chunk_root;
2109         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2110         key.offset = 0;
2111         key.type = BTRFS_DEV_ITEM_KEY;
2112
2113         while (1) {
2114                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2115                 if (ret < 0)
2116                         goto error;
2117
2118                 leaf = path->nodes[0];
2119 next_slot:
2120                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2121                         ret = btrfs_next_leaf(root, path);
2122                         if (ret > 0)
2123                                 break;
2124                         if (ret < 0)
2125                                 goto error;
2126                         leaf = path->nodes[0];
2127                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2128                         btrfs_release_path(path);
2129                         continue;
2130                 }
2131
2132                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2133                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2134                     key.type != BTRFS_DEV_ITEM_KEY)
2135                         break;
2136
2137                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2138                                           struct btrfs_dev_item);
2139                 devid = btrfs_device_id(leaf, dev_item);
2140                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2141                                    BTRFS_UUID_SIZE);
2142                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2143                                    BTRFS_UUID_SIZE);
2144                 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
2145                                            fs_uuid);
2146                 BUG_ON(!device); /* Logic error */
2147
2148                 if (device->fs_devices->seeding) {
2149                         btrfs_set_device_generation(leaf, dev_item,
2150                                                     device->generation);
2151                         btrfs_mark_buffer_dirty(leaf);
2152                 }
2153
2154                 path->slots[0]++;
2155                 goto next_slot;
2156         }
2157         ret = 0;
2158 error:
2159         btrfs_free_path(path);
2160         return ret;
2161 }
2162
2163 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2164 {
2165         struct request_queue *q;
2166         struct btrfs_trans_handle *trans;
2167         struct btrfs_device *device;
2168         struct block_device *bdev;
2169         struct list_head *devices;
2170         struct super_block *sb = root->fs_info->sb;
2171         struct rcu_string *name;
2172         u64 tmp;
2173         int seeding_dev = 0;
2174         int ret = 0;
2175
2176         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
2177                 return -EROFS;
2178
2179         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2180                                   root->fs_info->bdev_holder);
2181         if (IS_ERR(bdev))
2182                 return PTR_ERR(bdev);
2183
2184         if (root->fs_info->fs_devices->seeding) {
2185                 seeding_dev = 1;
2186                 down_write(&sb->s_umount);
2187                 mutex_lock(&uuid_mutex);
2188         }
2189
2190         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2191
2192         devices = &root->fs_info->fs_devices->devices;
2193
2194         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2195         list_for_each_entry(device, devices, dev_list) {
2196                 if (device->bdev == bdev) {
2197                         ret = -EEXIST;
2198                         mutex_unlock(
2199                                 &root->fs_info->fs_devices->device_list_mutex);
2200                         goto error;
2201                 }
2202         }
2203         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2204
2205         device = btrfs_alloc_device(root->fs_info, NULL, NULL);
2206         if (IS_ERR(device)) {
2207                 /* we can safely leave the fs_devices entry around */
2208                 ret = PTR_ERR(device);
2209                 goto error;
2210         }
2211
2212         name = rcu_string_strdup(device_path, GFP_NOFS);
2213         if (!name) {
2214                 kfree(device);
2215                 ret = -ENOMEM;
2216                 goto error;
2217         }
2218         rcu_assign_pointer(device->name, name);
2219
2220         trans = btrfs_start_transaction(root, 0);
2221         if (IS_ERR(trans)) {
2222                 rcu_string_free(device->name);
2223                 kfree(device);
2224                 ret = PTR_ERR(trans);
2225                 goto error;
2226         }
2227
2228         q = bdev_get_queue(bdev);
2229         if (blk_queue_discard(q))
2230                 device->can_discard = 1;
2231         device->writeable = 1;
2232         device->generation = trans->transid;
2233         device->io_width = root->sectorsize;
2234         device->io_align = root->sectorsize;
2235         device->sector_size = root->sectorsize;
2236         device->total_bytes = i_size_read(bdev->bd_inode);
2237         device->disk_total_bytes = device->total_bytes;
2238         device->commit_total_bytes = device->total_bytes;
2239         device->dev_root = root->fs_info->dev_root;
2240         device->bdev = bdev;
2241         device->in_fs_metadata = 1;
2242         device->is_tgtdev_for_dev_replace = 0;
2243         device->mode = FMODE_EXCL;
2244         device->dev_stats_valid = 1;
2245         set_blocksize(device->bdev, 4096);
2246
2247         if (seeding_dev) {
2248                 sb->s_flags &= ~MS_RDONLY;
2249                 ret = btrfs_prepare_sprout(root);
2250                 BUG_ON(ret); /* -ENOMEM */
2251         }
2252
2253         device->fs_devices = root->fs_info->fs_devices;
2254
2255         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2256         lock_chunks(root);
2257         list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
2258         list_add(&device->dev_alloc_list,
2259                  &root->fs_info->fs_devices->alloc_list);
2260         root->fs_info->fs_devices->num_devices++;
2261         root->fs_info->fs_devices->open_devices++;
2262         root->fs_info->fs_devices->rw_devices++;
2263         root->fs_info->fs_devices->total_devices++;
2264         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2265
2266         spin_lock(&root->fs_info->free_chunk_lock);
2267         root->fs_info->free_chunk_space += device->total_bytes;
2268         spin_unlock(&root->fs_info->free_chunk_lock);
2269
2270         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
2271                 root->fs_info->fs_devices->rotating = 1;
2272
2273         tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
2274         btrfs_set_super_total_bytes(root->fs_info->super_copy,
2275                                     tmp + device->total_bytes);
2276
2277         tmp = btrfs_super_num_devices(root->fs_info->super_copy);
2278         btrfs_set_super_num_devices(root->fs_info->super_copy,
2279                                     tmp + 1);
2280
2281         /* add sysfs device entry */
2282         btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device);
2283
2284         /*
2285          * we've got more storage, clear any full flags on the space
2286          * infos
2287          */
2288         btrfs_clear_space_info_full(root->fs_info);
2289
2290         unlock_chunks(root);
2291         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2292
2293         if (seeding_dev) {
2294                 lock_chunks(root);
2295                 ret = init_first_rw_device(trans, root, device);
2296                 unlock_chunks(root);
2297                 if (ret) {
2298                         btrfs_abort_transaction(trans, root, ret);
2299                         goto error_trans;
2300                 }
2301         }
2302
2303         ret = btrfs_add_device(trans, root, device);
2304         if (ret) {
2305                 btrfs_abort_transaction(trans, root, ret);
2306                 goto error_trans;
2307         }
2308
2309         if (seeding_dev) {
2310                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2311
2312                 ret = btrfs_finish_sprout(trans, root);
2313                 if (ret) {
2314                         btrfs_abort_transaction(trans, root, ret);
2315                         goto error_trans;
2316                 }
2317
2318                 /* Sprouting would change fsid of the mounted root,
2319                  * so rename the fsid on the sysfs
2320                  */
2321                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2322                                                 root->fs_info->fsid);
2323                 if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj,
2324                                                                 fsid_buf))
2325                         btrfs_warn(root->fs_info,
2326                                 "sysfs: failed to create fsid for sprout");
2327         }
2328
2329         root->fs_info->num_tolerated_disk_barrier_failures =
2330                 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
2331         ret = btrfs_commit_transaction(trans, root);
2332
2333         if (seeding_dev) {
2334                 mutex_unlock(&uuid_mutex);
2335                 up_write(&sb->s_umount);
2336
2337                 if (ret) /* transaction commit */
2338                         return ret;
2339
2340                 ret = btrfs_relocate_sys_chunks(root);
2341                 if (ret < 0)
2342                         btrfs_std_error(root->fs_info, ret,
2343                                     "Failed to relocate sys chunks after "
2344                                     "device initialization. This can be fixed "
2345                                     "using the \"btrfs balance\" command.");
2346                 trans = btrfs_attach_transaction(root);
2347                 if (IS_ERR(trans)) {
2348                         if (PTR_ERR(trans) == -ENOENT)
2349                                 return 0;
2350                         return PTR_ERR(trans);
2351                 }
2352                 ret = btrfs_commit_transaction(trans, root);
2353         }
2354
2355         /* Update ctime/mtime for libblkid */
2356         update_dev_time(device_path);
2357         return ret;
2358
2359 error_trans:
2360         btrfs_end_transaction(trans, root);
2361         rcu_string_free(device->name);
2362         btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
2363         kfree(device);
2364 error:
2365         blkdev_put(bdev, FMODE_EXCL);
2366         if (seeding_dev) {
2367                 mutex_unlock(&uuid_mutex);
2368                 up_write(&sb->s_umount);
2369         }
2370         return ret;
2371 }
2372
2373 int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2374                                   struct btrfs_device *srcdev,
2375                                   struct btrfs_device **device_out)
2376 {
2377         struct request_queue *q;
2378         struct btrfs_device *device;
2379         struct block_device *bdev;
2380         struct btrfs_fs_info *fs_info = root->fs_info;
2381         struct list_head *devices;
2382         struct rcu_string *name;
2383         u64 devid = BTRFS_DEV_REPLACE_DEVID;
2384         int ret = 0;
2385
2386         *device_out = NULL;
2387         if (fs_info->fs_devices->seeding) {
2388                 btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2389                 return -EINVAL;
2390         }
2391
2392         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2393                                   fs_info->bdev_holder);
2394         if (IS_ERR(bdev)) {
2395                 btrfs_err(fs_info, "target device %s is invalid!", device_path);
2396                 return PTR_ERR(bdev);
2397         }
2398
2399         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2400
2401         devices = &fs_info->fs_devices->devices;
2402         list_for_each_entry(device, devices, dev_list) {
2403                 if (device->bdev == bdev) {
2404                         btrfs_err(fs_info, "target device is in the filesystem!");
2405                         ret = -EEXIST;
2406                         goto error;
2407                 }
2408         }
2409
2410
2411         if (i_size_read(bdev->bd_inode) <
2412             btrfs_device_get_total_bytes(srcdev)) {
2413                 btrfs_err(fs_info, "target device is smaller than source device!");
2414                 ret = -EINVAL;
2415                 goto error;
2416         }
2417
2418
2419         device = btrfs_alloc_device(NULL, &devid, NULL);
2420         if (IS_ERR(device)) {
2421                 ret = PTR_ERR(device);
2422                 goto error;
2423         }
2424
2425         name = rcu_string_strdup(device_path, GFP_NOFS);
2426         if (!name) {
2427                 kfree(device);
2428                 ret = -ENOMEM;
2429                 goto error;
2430         }
2431         rcu_assign_pointer(device->name, name);
2432
2433         q = bdev_get_queue(bdev);
2434         if (blk_queue_discard(q))
2435                 device->can_discard = 1;
2436         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2437         device->writeable = 1;
2438         device->generation = 0;
2439         device->io_width = root->sectorsize;
2440         device->io_align = root->sectorsize;
2441         device->sector_size = root->sectorsize;
2442         device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2443         device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2444         device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2445         ASSERT(list_empty(&srcdev->resized_list));
2446         device->commit_total_bytes = srcdev->commit_total_bytes;
2447         device->commit_bytes_used = device->bytes_used;
2448         device->dev_root = fs_info->dev_root;
2449         device->bdev = bdev;
2450         device->in_fs_metadata = 1;
2451         device->is_tgtdev_for_dev_replace = 1;
2452         device->mode = FMODE_EXCL;
2453         device->dev_stats_valid = 1;
2454         set_blocksize(device->bdev, 4096);
2455         device->fs_devices = fs_info->fs_devices;
2456         list_add(&device->dev_list, &fs_info->fs_devices->devices);
2457         fs_info->fs_devices->num_devices++;
2458         fs_info->fs_devices->open_devices++;
2459         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2460
2461         *device_out = device;
2462         return ret;
2463
2464 error:
2465         blkdev_put(bdev, FMODE_EXCL);
2466         return ret;
2467 }
2468
2469 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2470                                               struct btrfs_device *tgtdev)
2471 {
2472         WARN_ON(fs_info->fs_devices->rw_devices == 0);
2473         tgtdev->io_width = fs_info->dev_root->sectorsize;
2474         tgtdev->io_align = fs_info->dev_root->sectorsize;
2475         tgtdev->sector_size = fs_info->dev_root->sectorsize;
2476         tgtdev->dev_root = fs_info->dev_root;
2477         tgtdev->in_fs_metadata = 1;
2478 }
2479
2480 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2481                                         struct btrfs_device *device)
2482 {
2483         int ret;
2484         struct btrfs_path *path;
2485         struct btrfs_root *root;
2486         struct btrfs_dev_item *dev_item;
2487         struct extent_buffer *leaf;
2488         struct btrfs_key key;
2489
2490         root = device->dev_root->fs_info->chunk_root;
2491
2492         path = btrfs_alloc_path();
2493         if (!path)
2494                 return -ENOMEM;
2495
2496         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2497         key.type = BTRFS_DEV_ITEM_KEY;
2498         key.offset = device->devid;
2499
2500         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2501         if (ret < 0)
2502                 goto out;
2503
2504         if (ret > 0) {
2505                 ret = -ENOENT;
2506                 goto out;
2507         }
2508
2509         leaf = path->nodes[0];
2510         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2511
2512         btrfs_set_device_id(leaf, dev_item, device->devid);
2513         btrfs_set_device_type(leaf, dev_item, device->type);
2514         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2515         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2516         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2517         btrfs_set_device_total_bytes(leaf, dev_item,
2518                                      btrfs_device_get_disk_total_bytes(device));
2519         btrfs_set_device_bytes_used(leaf, dev_item,
2520                                     btrfs_device_get_bytes_used(device));
2521         btrfs_mark_buffer_dirty(leaf);
2522
2523 out:
2524         btrfs_free_path(path);
2525         return ret;
2526 }
2527
2528 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2529                       struct btrfs_device *device, u64 new_size)
2530 {
2531         struct btrfs_super_block *super_copy =
2532                 device->dev_root->fs_info->super_copy;
2533         struct btrfs_fs_devices *fs_devices;
2534         u64 old_total;
2535         u64 diff;
2536
2537         if (!device->writeable)
2538                 return -EACCES;
2539
2540         lock_chunks(device->dev_root);
2541         old_total = btrfs_super_total_bytes(super_copy);
2542         diff = new_size - device->total_bytes;
2543
2544         if (new_size <= device->total_bytes ||
2545             device->is_tgtdev_for_dev_replace) {
2546                 unlock_chunks(device->dev_root);
2547                 return -EINVAL;
2548         }
2549
2550         fs_devices = device->dev_root->fs_info->fs_devices;
2551
2552         btrfs_set_super_total_bytes(super_copy, old_total + diff);
2553         device->fs_devices->total_rw_bytes += diff;
2554
2555         btrfs_device_set_total_bytes(device, new_size);
2556         btrfs_device_set_disk_total_bytes(device, new_size);
2557         btrfs_clear_space_info_full(device->dev_root->fs_info);
2558         if (list_empty(&device->resized_list))
2559                 list_add_tail(&device->resized_list,
2560                               &fs_devices->resized_devices);
2561         unlock_chunks(device->dev_root);
2562
2563         return btrfs_update_device(trans, device);
2564 }
2565
2566 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2567                             struct btrfs_root *root, u64 chunk_objectid,
2568                             u64 chunk_offset)
2569 {
2570         int ret;
2571         struct btrfs_path *path;
2572         struct btrfs_key key;
2573
2574         root = root->fs_info->chunk_root;
2575         path = btrfs_alloc_path();
2576         if (!path)
2577                 return -ENOMEM;
2578
2579         key.objectid = chunk_objectid;
2580         key.offset = chunk_offset;
2581         key.type = BTRFS_CHUNK_ITEM_KEY;
2582
2583         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2584         if (ret < 0)
2585                 goto out;
2586         else if (ret > 0) { /* Logic error or corruption */
2587                 btrfs_std_error(root->fs_info, -ENOENT,
2588                             "Failed lookup while freeing chunk.");
2589                 ret = -ENOENT;
2590                 goto out;
2591         }
2592
2593         ret = btrfs_del_item(trans, root, path);
2594         if (ret < 0)
2595                 btrfs_std_error(root->fs_info, ret,
2596                             "Failed to delete chunk item.");
2597 out:
2598         btrfs_free_path(path);
2599         return ret;
2600 }
2601
2602 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2603                         chunk_offset)
2604 {
2605         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2606         struct btrfs_disk_key *disk_key;
2607         struct btrfs_chunk *chunk;
2608         u8 *ptr;
2609         int ret = 0;
2610         u32 num_stripes;
2611         u32 array_size;
2612         u32 len = 0;
2613         u32 cur;
2614         struct btrfs_key key;
2615
2616         lock_chunks(root);
2617         array_size = btrfs_super_sys_array_size(super_copy);
2618
2619         ptr = super_copy->sys_chunk_array;
2620         cur = 0;
2621
2622         while (cur < array_size) {
2623                 disk_key = (struct btrfs_disk_key *)ptr;
2624                 btrfs_disk_key_to_cpu(&key, disk_key);
2625
2626                 len = sizeof(*disk_key);
2627
2628                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2629                         chunk = (struct btrfs_chunk *)(ptr + len);
2630                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2631                         len += btrfs_chunk_item_size(num_stripes);
2632                 } else {
2633                         ret = -EIO;
2634                         break;
2635                 }
2636                 if (key.objectid == chunk_objectid &&
2637                     key.offset == chunk_offset) {
2638                         memmove(ptr, ptr + len, array_size - (cur + len));
2639                         array_size -= len;
2640                         btrfs_set_super_sys_array_size(super_copy, array_size);
2641                 } else {
2642                         ptr += len;
2643                         cur += len;
2644                 }
2645         }
2646         unlock_chunks(root);
2647         return ret;
2648 }
2649
2650 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2651                        struct btrfs_root *root, u64 chunk_offset)
2652 {
2653         struct extent_map_tree *em_tree;
2654         struct extent_map *em;
2655         struct btrfs_root *extent_root = root->fs_info->extent_root;
2656         struct map_lookup *map;
2657         u64 dev_extent_len = 0;
2658         u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2659         int i, ret = 0;
2660
2661         /* Just in case */
2662         root = root->fs_info->chunk_root;
2663         em_tree = &root->fs_info->mapping_tree.map_tree;
2664
2665         read_lock(&em_tree->lock);
2666         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2667         read_unlock(&em_tree->lock);
2668
2669         if (!em || em->start > chunk_offset ||
2670             em->start + em->len < chunk_offset) {
2671                 /*
2672                  * This is a logic error, but we don't want to just rely on the
2673                  * user having built with ASSERT enabled, so if ASSERT doens't
2674                  * do anything we still error out.
2675                  */
2676                 ASSERT(0);
2677                 if (em)
2678                         free_extent_map(em);
2679                 return -EINVAL;
2680         }
2681         map = (struct map_lookup *)em->bdev;
2682         lock_chunks(root->fs_info->chunk_root);
2683         check_system_chunk(trans, extent_root, map->type);
2684         unlock_chunks(root->fs_info->chunk_root);
2685
2686         for (i = 0; i < map->num_stripes; i++) {
2687                 struct btrfs_device *device = map->stripes[i].dev;
2688                 ret = btrfs_free_dev_extent(trans, device,
2689                                             map->stripes[i].physical,
2690                                             &dev_extent_len);
2691                 if (ret) {
2692                         btrfs_abort_transaction(trans, root, ret);
2693                         goto out;
2694                 }
2695
2696                 if (device->bytes_used > 0) {
2697                         lock_chunks(root);
2698                         btrfs_device_set_bytes_used(device,
2699                                         device->bytes_used - dev_extent_len);
2700                         spin_lock(&root->fs_info->free_chunk_lock);
2701                         root->fs_info->free_chunk_space += dev_extent_len;
2702                         spin_unlock(&root->fs_info->free_chunk_lock);
2703                         btrfs_clear_space_info_full(root->fs_info);
2704                         unlock_chunks(root);
2705                 }
2706
2707                 if (map->stripes[i].dev) {
2708                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2709                         if (ret) {
2710                                 btrfs_abort_transaction(trans, root, ret);
2711                                 goto out;
2712                         }
2713                 }
2714         }
2715         ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
2716         if (ret) {
2717                 btrfs_abort_transaction(trans, root, ret);
2718                 goto out;
2719         }
2720
2721         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2722
2723         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2724                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2725                 if (ret) {
2726                         btrfs_abort_transaction(trans, root, ret);
2727                         goto out;
2728                 }
2729         }
2730
2731         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
2732         if (ret) {
2733                 btrfs_abort_transaction(trans, extent_root, ret);
2734                 goto out;
2735         }
2736
2737 out:
2738         /* once for us */
2739         free_extent_map(em);
2740         return ret;
2741 }
2742
2743 static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
2744 {
2745         struct btrfs_root *extent_root;
2746         struct btrfs_trans_handle *trans;
2747         int ret;
2748
2749         root = root->fs_info->chunk_root;
2750         extent_root = root->fs_info->extent_root;
2751
2752         /*
2753          * Prevent races with automatic removal of unused block groups.
2754          * After we relocate and before we remove the chunk with offset
2755          * chunk_offset, automatic removal of the block group can kick in,
2756          * resulting in a failure when calling btrfs_remove_chunk() below.
2757          *
2758          * Make sure to acquire this mutex before doing a tree search (dev
2759          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
2760          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
2761          * we release the path used to search the chunk/dev tree and before
2762          * the current task acquires this mutex and calls us.
2763          */
2764         ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex));
2765
2766         ret = btrfs_can_relocate(extent_root, chunk_offset);
2767         if (ret)
2768                 return -ENOSPC;
2769
2770         /* step one, relocate all the extents inside this chunk */
2771         btrfs_scrub_pause(root);
2772         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2773         btrfs_scrub_continue(root);
2774         if (ret)
2775                 return ret;
2776
2777         trans = btrfs_start_transaction(root, 0);
2778         if (IS_ERR(trans)) {
2779                 ret = PTR_ERR(trans);
2780                 btrfs_std_error(root->fs_info, ret, NULL);
2781                 return ret;
2782         }
2783
2784         /*
2785          * step two, delete the device extents and the
2786          * chunk tree entries
2787          */
2788         ret = btrfs_remove_chunk(trans, root, chunk_offset);
2789         btrfs_end_transaction(trans, root);
2790         return ret;
2791 }
2792
2793 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2794 {
2795         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2796         struct btrfs_path *path;
2797         struct extent_buffer *leaf;
2798         struct btrfs_chunk *chunk;
2799         struct btrfs_key key;
2800         struct btrfs_key found_key;
2801         u64 chunk_type;
2802         bool retried = false;
2803         int failed = 0;
2804         int ret;
2805
2806         path = btrfs_alloc_path();
2807         if (!path)
2808                 return -ENOMEM;
2809
2810 again:
2811         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2812         key.offset = (u64)-1;
2813         key.type = BTRFS_CHUNK_ITEM_KEY;
2814
2815         while (1) {
2816                 mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
2817                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2818                 if (ret < 0) {
2819                         mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
2820                         goto error;
2821                 }
2822                 BUG_ON(ret == 0); /* Corruption */
2823
2824                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2825                                           key.type);
2826                 if (ret)
2827                         mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
2828                 if (ret < 0)
2829                         goto error;
2830                 if (ret > 0)
2831                         break;
2832
2833                 leaf = path->nodes[0];
2834                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2835
2836                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2837                                        struct btrfs_chunk);
2838                 chunk_type = btrfs_chunk_type(leaf, chunk);
2839                 btrfs_release_path(path);
2840
2841                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2842                         ret = btrfs_relocate_chunk(chunk_root,
2843                                                    found_key.offset);
2844                         if (ret == -ENOSPC)
2845                                 failed++;
2846                         else
2847                                 BUG_ON(ret);
2848                 }
2849                 mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
2850
2851                 if (found_key.offset == 0)
2852                         break;
2853                 key.offset = found_key.offset - 1;
2854         }
2855         ret = 0;
2856         if (failed && !retried) {
2857                 failed = 0;
2858                 retried = true;
2859                 goto again;
2860         } else if (WARN_ON(failed && retried)) {
2861                 ret = -ENOSPC;
2862         }
2863 error:
2864         btrfs_free_path(path);
2865         return ret;
2866 }
2867
2868 static int insert_balance_item(struct btrfs_root *root,
2869                                struct btrfs_balance_control *bctl)
2870 {
2871         struct btrfs_trans_handle *trans;
2872         struct btrfs_balance_item *item;
2873         struct btrfs_disk_balance_args disk_bargs;
2874         struct btrfs_path *path;
2875         struct extent_buffer *leaf;
2876         struct btrfs_key key;
2877         int ret, err;
2878
2879         path = btrfs_alloc_path();
2880         if (!path)
2881                 return -ENOMEM;
2882
2883         trans = btrfs_start_transaction(root, 0);
2884         if (IS_ERR(trans)) {
2885                 btrfs_free_path(path);
2886                 return PTR_ERR(trans);
2887         }
2888
2889         key.objectid = BTRFS_BALANCE_OBJECTID;
2890         key.type = BTRFS_BALANCE_ITEM_KEY;
2891         key.offset = 0;
2892
2893         ret = btrfs_insert_empty_item(trans, root, path, &key,
2894                                       sizeof(*item));
2895         if (ret)
2896                 goto out;
2897
2898         leaf = path->nodes[0];
2899         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2900
2901         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2902
2903         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2904         btrfs_set_balance_data(leaf, item, &disk_bargs);
2905         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2906         btrfs_set_balance_meta(leaf, item, &disk_bargs);
2907         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2908         btrfs_set_balance_sys(leaf, item, &disk_bargs);
2909
2910         btrfs_set_balance_flags(leaf, item, bctl->flags);
2911
2912         btrfs_mark_buffer_dirty(leaf);
2913 out:
2914         btrfs_free_path(path);
2915         err = btrfs_commit_transaction(trans, root);
2916         if (err && !ret)
2917                 ret = err;
2918         return ret;
2919 }
2920
2921 static int del_balance_item(struct btrfs_root *root)
2922 {
2923         struct btrfs_trans_handle *trans;
2924         struct btrfs_path *path;
2925         struct btrfs_key key;
2926         int ret, err;
2927
2928         path = btrfs_alloc_path();
2929         if (!path)
2930                 return -ENOMEM;
2931
2932         trans = btrfs_start_transaction(root, 0);
2933         if (IS_ERR(trans)) {
2934                 btrfs_free_path(path);
2935                 return PTR_ERR(trans);
2936         }
2937
2938         key.objectid = BTRFS_BALANCE_OBJECTID;
2939         key.type = BTRFS_BALANCE_ITEM_KEY;
2940         key.offset = 0;
2941
2942         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2943         if (ret < 0)
2944                 goto out;
2945         if (ret > 0) {
2946                 ret = -ENOENT;
2947                 goto out;
2948         }
2949
2950         ret = btrfs_del_item(trans, root, path);
2951 out:
2952         btrfs_free_path(path);
2953         err = btrfs_commit_transaction(trans, root);
2954         if (err && !ret)
2955                 ret = err;
2956         return ret;
2957 }
2958
2959 /*
2960  * This is a heuristic used to reduce the number of chunks balanced on
2961  * resume after balance was interrupted.
2962  */
2963 static void update_balance_args(struct btrfs_balance_control *bctl)
2964 {
2965         /*
2966          * Turn on soft mode for chunk types that were being converted.
2967          */
2968         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2969                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2970         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2971                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2972         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2973                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2974
2975         /*
2976          * Turn on usage filter if is not already used.  The idea is
2977          * that chunks that we have already balanced should be
2978          * reasonably full.  Don't do it for chunks that are being
2979          * converted - that will keep us from relocating unconverted
2980          * (albeit full) chunks.
2981          */
2982         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2983             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2984                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2985                 bctl->data.usage = 90;
2986         }
2987         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2988             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2989                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2990                 bctl->sys.usage = 90;
2991         }
2992         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2993             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2994                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2995                 bctl->meta.usage = 90;
2996         }
2997 }
2998
2999 /*
3000  * Should be called with both balance and volume mutexes held to
3001  * serialize other volume operations (add_dev/rm_dev/resize) with
3002  * restriper.  Same goes for unset_balance_control.
3003  */
3004 static void set_balance_control(struct btrfs_balance_control *bctl)
3005 {
3006         struct btrfs_fs_info *fs_info = bctl->fs_info;
3007
3008         BUG_ON(fs_info->balance_ctl);
3009
3010         spin_lock(&fs_info->balance_lock);
3011         fs_info->balance_ctl = bctl;
3012         spin_unlock(&fs_info->balance_lock);
3013 }
3014
3015 static void unset_balance_control(struct btrfs_fs_info *fs_info)
3016 {
3017         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3018
3019         BUG_ON(!fs_info->balance_ctl);
3020
3021         spin_lock(&fs_info->balance_lock);
3022         fs_info->balance_ctl = NULL;
3023         spin_unlock(&fs_info->balance_lock);
3024
3025         kfree(bctl);
3026 }
3027
3028 /*
3029  * Balance filters.  Return 1 if chunk should be filtered out
3030  * (should not be balanced).
3031  */
3032 static int chunk_profiles_filter(u64 chunk_type,
3033                                  struct btrfs_balance_args *bargs)
3034 {
3035         chunk_type = chunk_to_extended(chunk_type) &
3036                                 BTRFS_EXTENDED_PROFILE_MASK;
3037
3038         if (bargs->profiles & chunk_type)
3039                 return 0;
3040
3041         return 1;
3042 }
3043
3044 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3045                               struct btrfs_balance_args *bargs)
3046 {
3047         struct btrfs_block_group_cache *cache;
3048         u64 chunk_used, user_thresh;
3049         int ret = 1;
3050
3051         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3052         chunk_used = btrfs_block_group_used(&cache->item);
3053
3054         if (bargs->usage == 0)
3055                 user_thresh = 1;
3056         else if (bargs->usage > 100)
3057                 user_thresh = cache->key.offset;
3058         else
3059                 user_thresh = div_factor_fine(cache->key.offset,
3060                                               bargs->usage);
3061
3062         if (chunk_used < user_thresh)
3063                 ret = 0;
3064
3065         btrfs_put_block_group(cache);
3066         return ret;
3067 }
3068
3069 static int chunk_devid_filter(struct extent_buffer *leaf,
3070                               struct btrfs_chunk *chunk,
3071                               struct btrfs_balance_args *bargs)
3072 {
3073         struct btrfs_stripe *stripe;
3074         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3075         int i;
3076
3077         for (i = 0; i < num_stripes; i++) {
3078                 stripe = btrfs_stripe_nr(chunk, i);
3079                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3080                         return 0;
3081         }
3082
3083         return 1;
3084 }
3085
3086 /* [pstart, pend) */
3087 static int chunk_drange_filter(struct extent_buffer *leaf,
3088                                struct btrfs_chunk *chunk,
3089                                u64 chunk_offset,
3090                                struct btrfs_balance_args *bargs)
3091 {
3092         struct btrfs_stripe *stripe;
3093         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3094         u64 stripe_offset;
3095         u64 stripe_length;
3096         int factor;
3097         int i;
3098
3099         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3100                 return 0;
3101
3102         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
3103              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
3104                 factor = num_stripes / 2;
3105         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
3106                 factor = num_stripes - 1;
3107         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
3108                 factor = num_stripes - 2;
3109         } else {
3110                 factor = num_stripes;
3111         }
3112
3113         for (i = 0; i < num_stripes; i++) {
3114                 stripe = btrfs_stripe_nr(chunk, i);
3115                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3116                         continue;
3117
3118                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3119                 stripe_length = btrfs_chunk_length(leaf, chunk);
3120                 stripe_length = div_u64(stripe_length, factor);
3121
3122                 if (stripe_offset < bargs->pend &&
3123                     stripe_offset + stripe_length > bargs->pstart)
3124                         return 0;
3125         }
3126
3127         return 1;
3128 }
3129
3130 /* [vstart, vend) */
3131 static int chunk_vrange_filter(struct extent_buffer *leaf,
3132                                struct btrfs_chunk *chunk,
3133                                u64 chunk_offset,
3134                                struct btrfs_balance_args *bargs)
3135 {
3136         if (chunk_offset < bargs->vend &&
3137             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3138                 /* at least part of the chunk is inside this vrange */
3139                 return 0;
3140
3141         return 1;
3142 }
3143
3144 static int chunk_soft_convert_filter(u64 chunk_type,
3145                                      struct btrfs_balance_args *bargs)
3146 {
3147         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3148                 return 0;
3149
3150         chunk_type = chunk_to_extended(chunk_type) &
3151                                 BTRFS_EXTENDED_PROFILE_MASK;
3152
3153         if (bargs->target == chunk_type)
3154                 return 1;
3155
3156         return 0;
3157 }
3158
3159 static int should_balance_chunk(struct btrfs_root *root,
3160                                 struct extent_buffer *leaf,
3161                                 struct btrfs_chunk *chunk, u64 chunk_offset)
3162 {
3163         struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
3164         struct btrfs_balance_args *bargs = NULL;
3165         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3166
3167         /* type filter */
3168         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3169               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3170                 return 0;
3171         }
3172
3173         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3174                 bargs = &bctl->data;
3175         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3176                 bargs = &bctl->sys;
3177         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3178                 bargs = &bctl->meta;
3179
3180         /* profiles filter */
3181         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3182             chunk_profiles_filter(chunk_type, bargs)) {
3183                 return 0;
3184         }
3185
3186         /* usage filter */
3187         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3188             chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
3189                 return 0;
3190         }
3191
3192         /* devid filter */
3193         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3194             chunk_devid_filter(leaf, chunk, bargs)) {
3195                 return 0;
3196         }
3197
3198         /* drange filter, makes sense only with devid filter */
3199         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3200             chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
3201                 return 0;
3202         }
3203
3204         /* vrange filter */
3205         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3206             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3207                 return 0;
3208         }
3209
3210         /* soft profile changing mode */
3211         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3212             chunk_soft_convert_filter(chunk_type, bargs)) {
3213                 return 0;
3214         }
3215
3216         /*
3217          * limited by count, must be the last filter
3218          */
3219         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3220                 if (bargs->limit == 0)
3221                         return 0;
3222                 else
3223                         bargs->limit--;
3224         }
3225
3226         return 1;
3227 }
3228
3229 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3230 {
3231         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3232         struct btrfs_root *chunk_root = fs_info->chunk_root;
3233         struct btrfs_root *dev_root = fs_info->dev_root;
3234         struct list_head *devices;
3235         struct btrfs_device *device;
3236         u64 old_size;
3237         u64 size_to_free;
3238         struct btrfs_chunk *chunk;
3239         struct btrfs_path *path;
3240         struct btrfs_key key;
3241         struct btrfs_key found_key;
3242         struct btrfs_trans_handle *trans;
3243         struct extent_buffer *leaf;
3244         int slot;
3245         int ret;
3246         int enospc_errors = 0;
3247         bool counting = true;
3248         u64 limit_data = bctl->data.limit;
3249         u64 limit_meta = bctl->meta.limit;
3250         u64 limit_sys = bctl->sys.limit;
3251
3252         /* step one make some room on all the devices */
3253         devices = &fs_info->fs_devices->devices;
3254         list_for_each_entry(device, devices, dev_list) {
3255                 old_size = btrfs_device_get_total_bytes(device);
3256                 size_to_free = div_factor(old_size, 1);
3257                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
3258                 if (!device->writeable ||
3259                     btrfs_device_get_total_bytes(device) -
3260                     btrfs_device_get_bytes_used(device) > size_to_free ||
3261                     device->is_tgtdev_for_dev_replace)
3262                         continue;
3263
3264                 ret = btrfs_shrink_device(device, old_size - size_to_free);
3265                 if (ret == -ENOSPC)
3266                         break;
3267                 BUG_ON(ret);
3268
3269                 trans = btrfs_start_transaction(dev_root, 0);
3270                 BUG_ON(IS_ERR(trans));
3271
3272                 ret = btrfs_grow_device(trans, device, old_size);
3273                 BUG_ON(ret);
3274
3275                 btrfs_end_transaction(trans, dev_root);
3276         }
3277
3278         /* step two, relocate all the chunks */
3279         path = btrfs_alloc_path();
3280         if (!path) {
3281                 ret = -ENOMEM;
3282                 goto error;
3283         }
3284
3285         /* zero out stat counters */
3286         spin_lock(&fs_info->balance_lock);
3287         memset(&bctl->stat, 0, sizeof(bctl->stat));
3288         spin_unlock(&fs_info->balance_lock);
3289 again:
3290         if (!counting) {
3291                 bctl->data.limit = limit_data;
3292                 bctl->meta.limit = limit_meta;
3293                 bctl->sys.limit = limit_sys;
3294         }
3295         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3296         key.offset = (u64)-1;
3297         key.type = BTRFS_CHUNK_ITEM_KEY;
3298
3299         while (1) {
3300                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3301                     atomic_read(&fs_info->balance_cancel_req)) {
3302                         ret = -ECANCELED;
3303                         goto error;
3304                 }
3305
3306                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3307                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3308                 if (ret < 0) {
3309                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3310                         goto error;
3311                 }
3312
3313                 /*
3314                  * this shouldn't happen, it means the last relocate
3315                  * failed
3316                  */
3317                 if (ret == 0)
3318                         BUG(); /* FIXME break ? */
3319
3320                 ret = btrfs_previous_item(chunk_root, path, 0,
3321                                           BTRFS_CHUNK_ITEM_KEY);
3322                 if (ret) {
3323                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3324                         ret = 0;
3325                         break;
3326                 }
3327
3328                 leaf = path->nodes[0];
3329                 slot = path->slots[0];
3330                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3331
3332                 if (found_key.objectid != key.objectid) {
3333                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3334                         break;
3335                 }
3336
3337                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3338
3339                 if (!counting) {
3340                         spin_lock(&fs_info->balance_lock);
3341                         bctl->stat.considered++;
3342                         spin_unlock(&fs_info->balance_lock);
3343                 }
3344
3345                 ret = should_balance_chunk(chunk_root, leaf, chunk,
3346                                            found_key.offset);
3347                 btrfs_release_path(path);
3348                 if (!ret) {
3349                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3350                         goto loop;
3351                 }
3352
3353                 if (counting) {
3354                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3355                         spin_lock(&fs_info->balance_lock);
3356                         bctl->stat.expected++;
3357                         spin_unlock(&fs_info->balance_lock);
3358                         goto loop;
3359                 }
3360
3361                 ret = btrfs_relocate_chunk(chunk_root,
3362                                            found_key.offset);
3363                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3364                 if (ret && ret != -ENOSPC)
3365                         goto error;
3366                 if (ret == -ENOSPC) {
3367                         enospc_errors++;
3368                 } else {
3369                         spin_lock(&fs_info->balance_lock);
3370                         bctl->stat.completed++;
3371                         spin_unlock(&fs_info->balance_lock);
3372                 }
3373 loop:
3374                 if (found_key.offset == 0)
3375                         break;
3376                 key.offset = found_key.offset - 1;
3377         }
3378
3379         if (counting) {
3380                 btrfs_release_path(path);
3381                 counting = false;
3382                 goto again;
3383         }
3384 error:
3385         btrfs_free_path(path);
3386         if (enospc_errors) {
3387                 btrfs_info(fs_info, "%d enospc errors during balance",
3388                        enospc_errors);
3389                 if (!ret)
3390                         ret = -ENOSPC;
3391         }
3392
3393         return ret;
3394 }
3395
3396 /**
3397  * alloc_profile_is_valid - see if a given profile is valid and reduced
3398  * @flags: profile to validate
3399  * @extended: if true @flags is treated as an extended profile
3400  */
3401 static int alloc_profile_is_valid(u64 flags, int extended)
3402 {
3403         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3404                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
3405
3406         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3407
3408         /* 1) check that all other bits are zeroed */
3409         if (flags & ~mask)
3410                 return 0;
3411
3412         /* 2) see if profile is reduced */
3413         if (flags == 0)
3414                 return !extended; /* "0" is valid for usual profiles */
3415
3416         /* true if exactly one bit set */
3417         return (flags & (flags - 1)) == 0;
3418 }
3419
3420 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3421 {
3422         /* cancel requested || normal exit path */
3423         return atomic_read(&fs_info->balance_cancel_req) ||
3424                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3425                  atomic_read(&fs_info->balance_cancel_req) == 0);
3426 }
3427
3428 static void __cancel_balance(struct btrfs_fs_info *fs_info)
3429 {
3430         int ret;
3431
3432         unset_balance_control(fs_info);
3433         ret = del_balance_item(fs_info->tree_root);
3434         if (ret)
3435                 btrfs_std_error(fs_info, ret, NULL);
3436
3437         atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3438 }
3439
3440 /*
3441  * Should be called with both balance and volume mutexes held
3442  */
3443 int btrfs_balance(struct btrfs_balance_control *bctl,
3444                   struct btrfs_ioctl_balance_args *bargs)
3445 {
3446         struct btrfs_fs_info *fs_info = bctl->fs_info;
3447         u64 allowed;
3448         int mixed = 0;
3449         int ret;
3450         u64 num_devices;
3451         unsigned seq;
3452
3453         if (btrfs_fs_closing(fs_info) ||
3454             atomic_read(&fs_info->balance_pause_req) ||
3455             atomic_read(&fs_info->balance_cancel_req)) {
3456                 ret = -EINVAL;
3457                 goto out;
3458         }
3459
3460         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
3461         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
3462                 mixed = 1;
3463
3464         /*
3465          * In case of mixed groups both data and meta should be picked,
3466          * and identical options should be given for both of them.
3467          */
3468         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
3469         if (mixed && (bctl->flags & allowed)) {
3470                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
3471                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
3472                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
3473                         btrfs_err(fs_info, "with mixed groups data and "
3474                                    "metadata balance options must be the same");
3475                         ret = -EINVAL;
3476                         goto out;
3477                 }
3478         }
3479
3480         num_devices = fs_info->fs_devices->num_devices;
3481         btrfs_dev_replace_lock(&fs_info->dev_replace);
3482         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3483                 BUG_ON(num_devices < 1);
3484                 num_devices--;
3485         }
3486         btrfs_dev_replace_unlock(&fs_info->dev_replace);
3487         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3488         if (num_devices == 1)
3489                 allowed |= BTRFS_BLOCK_GROUP_DUP;
3490         else if (num_devices > 1)
3491                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3492         if (num_devices > 2)
3493                 allowed |= BTRFS_BLOCK_GROUP_RAID5;
3494         if (num_devices > 3)
3495                 allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
3496                             BTRFS_BLOCK_GROUP_RAID6);
3497         if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3498             (!alloc_profile_is_valid(bctl->data.target, 1) ||
3499              (bctl->data.target & ~allowed))) {
3500                 btrfs_err(fs_info, "unable to start balance with target "
3501                            "data profile %llu",
3502                        bctl->data.target);
3503                 ret = -EINVAL;
3504                 goto out;
3505         }
3506         if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3507             (!alloc_profile_is_valid(bctl->meta.target, 1) ||
3508              (bctl->meta.target & ~allowed))) {
3509                 btrfs_err(fs_info,
3510                            "unable to start balance with target metadata profile %llu",
3511                        bctl->meta.target);
3512                 ret = -EINVAL;
3513                 goto out;
3514         }
3515         if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3516             (!alloc_profile_is_valid(bctl->sys.target, 1) ||
3517              (bctl->sys.target & ~allowed))) {
3518                 btrfs_err(fs_info,
3519                            "unable to start balance with target system profile %llu",
3520                        bctl->sys.target);
3521                 ret = -EINVAL;
3522                 goto out;
3523         }
3524
3525         /* allow dup'ed data chunks only in mixed mode */
3526         if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3527             (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
3528                 btrfs_err(fs_info, "dup for data is not allowed");
3529                 ret = -EINVAL;
3530                 goto out;
3531         }
3532
3533         /* allow to reduce meta or sys integrity only if force set */
3534         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3535                         BTRFS_BLOCK_GROUP_RAID10 |
3536                         BTRFS_BLOCK_GROUP_RAID5 |
3537                         BTRFS_BLOCK_GROUP_RAID6;
3538         do {
3539                 seq = read_seqbegin(&fs_info->profiles_lock);
3540
3541                 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3542                      (fs_info->avail_system_alloc_bits & allowed) &&
3543                      !(bctl->sys.target & allowed)) ||
3544                     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3545                      (fs_info->avail_metadata_alloc_bits & allowed) &&
3546                      !(bctl->meta.target & allowed))) {
3547                         if (bctl->flags & BTRFS_BALANCE_FORCE) {
3548                                 btrfs_info(fs_info, "force reducing metadata integrity");
3549                         } else {
3550                                 btrfs_err(fs_info, "balance will reduce metadata "
3551                                            "integrity, use force if you want this");
3552                                 ret = -EINVAL;
3553                                 goto out;
3554                         }
3555                 }
3556         } while (read_seqretry(&fs_info->profiles_lock, seq));
3557
3558         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3559                 fs_info->num_tolerated_disk_barrier_failures = min(
3560                         btrfs_calc_num_tolerated_disk_barrier_failures(fs_info),
3561                         btrfs_get_num_tolerated_disk_barrier_failures(
3562                                 bctl->sys.target));
3563         }
3564
3565         ret = insert_balance_item(fs_info->tree_root, bctl);
3566         if (ret && ret != -EEXIST)
3567                 goto out;
3568
3569         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
3570                 BUG_ON(ret == -EEXIST);
3571                 set_balance_control(bctl);
3572         } else {
3573                 BUG_ON(ret != -EEXIST);
3574                 spin_lock(&fs_info->balance_lock);
3575                 update_balance_args(bctl);
3576                 spin_unlock(&fs_info->balance_lock);
3577         }
3578
3579         atomic_inc(&fs_info->balance_running);
3580         mutex_unlock(&fs_info->balance_mutex);
3581
3582         ret = __btrfs_balance(fs_info);
3583
3584         mutex_lock(&fs_info->balance_mutex);
3585         atomic_dec(&fs_info->balance_running);
3586
3587         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3588                 fs_info->num_tolerated_disk_barrier_failures =
3589                         btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3590         }
3591
3592         if (bargs) {
3593                 memset(bargs, 0, sizeof(*bargs));
3594                 update_ioctl_balance_args(fs_info, 0, bargs);
3595         }
3596
3597         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3598             balance_need_close(fs_info)) {
3599                 __cancel_balance(fs_info);
3600         }
3601
3602         wake_up(&fs_info->balance_wait_q);
3603
3604         return ret;
3605 out:
3606         if (bctl->flags & BTRFS_BALANCE_RESUME)
3607                 __cancel_balance(fs_info);
3608         else {
3609                 kfree(bctl);
3610                 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3611         }
3612         return ret;
3613 }
3614
3615 static int balance_kthread(void *data)
3616 {
3617         struct btrfs_fs_info *fs_info = data;
3618         int ret = 0;
3619
3620         mutex_lock(&fs_info->volume_mutex);
3621         mutex_lock(&fs_info->balance_mutex);
3622
3623         if (fs_info->balance_ctl) {
3624                 btrfs_info(fs_info, "continuing balance");
3625                 ret = btrfs_balance(fs_info->balance_ctl, NULL);
3626         }
3627
3628         mutex_unlock(&fs_info->balance_mutex);
3629         mutex_unlock(&fs_info->volume_mutex);
3630
3631         return ret;
3632 }
3633
3634 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3635 {
3636         struct task_struct *tsk;
3637
3638         spin_lock(&fs_info->balance_lock);
3639         if (!fs_info->balance_ctl) {
3640                 spin_unlock(&fs_info->balance_lock);
3641                 return 0;
3642         }
3643         spin_unlock(&fs_info->balance_lock);
3644
3645         if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
3646                 btrfs_info(fs_info, "force skipping balance");
3647                 return 0;
3648         }
3649
3650         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3651         return PTR_ERR_OR_ZERO(tsk);
3652 }
3653
3654 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
3655 {
3656         struct btrfs_balance_control *bctl;
3657         struct btrfs_balance_item *item;
3658         struct btrfs_disk_balance_args disk_bargs;
3659         struct btrfs_path *path;
3660         struct extent_buffer *leaf;
3661         struct btrfs_key key;
3662         int ret;
3663
3664         path = btrfs_alloc_path();
3665         if (!path)
3666                 return -ENOMEM;
3667
3668         key.objectid = BTRFS_BALANCE_OBJECTID;
3669         key.type = BTRFS_BALANCE_ITEM_KEY;
3670         key.offset = 0;
3671
3672         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
3673         if (ret < 0)
3674                 goto out;
3675         if (ret > 0) { /* ret = -ENOENT; */
3676                 ret = 0;
3677                 goto out;
3678         }
3679
3680         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3681         if (!bctl) {
3682                 ret = -ENOMEM;
3683                 goto out;
3684         }
3685
3686         leaf = path->nodes[0];
3687         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3688
3689         bctl->fs_info = fs_info;
3690         bctl->flags = btrfs_balance_flags(leaf, item);
3691         bctl->flags |= BTRFS_BALANCE_RESUME;
3692
3693         btrfs_balance_data(leaf, item, &disk_bargs);
3694         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
3695         btrfs_balance_meta(leaf, item, &disk_bargs);
3696         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
3697         btrfs_balance_sys(leaf, item, &disk_bargs);
3698         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
3699
3700         WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
3701
3702         mutex_lock(&fs_info->volume_mutex);
3703         mutex_lock(&fs_info->balance_mutex);
3704
3705         set_balance_control(bctl);
3706
3707         mutex_unlock(&fs_info->balance_mutex);
3708         mutex_unlock(&fs_info->volume_mutex);
3709 out:
3710         btrfs_free_path(path);
3711         return ret;
3712 }
3713
3714 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
3715 {
3716         int ret = 0;
3717
3718         mutex_lock(&fs_info->balance_mutex);
3719         if (!fs_info->balance_ctl) {
3720                 mutex_unlock(&fs_info->balance_mutex);
3721                 return -ENOTCONN;
3722         }
3723
3724         if (atomic_read(&fs_info->balance_running)) {
3725                 atomic_inc(&fs_info->balance_pause_req);
3726                 mutex_unlock(&fs_info->balance_mutex);
3727
3728                 wait_event(fs_info->balance_wait_q,
3729                            atomic_read(&fs_info->balance_running) == 0);
3730
3731                 mutex_lock(&fs_info->balance_mutex);
3732                 /* we are good with balance_ctl ripped off from under us */
3733                 BUG_ON(atomic_read(&fs_info->balance_running));
3734                 atomic_dec(&fs_info->balance_pause_req);
3735         } else {
3736                 ret = -ENOTCONN;
3737         }
3738
3739         mutex_unlock(&fs_info->balance_mutex);
3740         return ret;
3741 }
3742
3743 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
3744 {
3745         if (fs_info->sb->s_flags & MS_RDONLY)
3746                 return -EROFS;
3747
3748         mutex_lock(&fs_info->balance_mutex);
3749         if (!fs_info->balance_ctl) {
3750                 mutex_unlock(&fs_info->balance_mutex);
3751                 return -ENOTCONN;
3752         }
3753
3754         atomic_inc(&fs_info->balance_cancel_req);
3755         /*
3756          * if we are running just wait and return, balance item is
3757          * deleted in btrfs_balance in this case
3758          */
3759         if (atomic_read(&fs_info->balance_running)) {
3760                 mutex_unlock(&fs_info->balance_mutex);
3761                 wait_event(fs_info->balance_wait_q,
3762                            atomic_read(&fs_info->balance_running) == 0);
3763                 mutex_lock(&fs_info->balance_mutex);
3764         } else {
3765                 /* __cancel_balance needs volume_mutex */
3766                 mutex_unlock(&fs_info->balance_mutex);
3767                 mutex_lock(&fs_info->volume_mutex);
3768                 mutex_lock(&fs_info->balance_mutex);
3769
3770                 if (fs_info->balance_ctl)
3771                         __cancel_balance(fs_info);
3772
3773                 mutex_unlock(&fs_info->volume_mutex);
3774         }
3775
3776         BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
3777         atomic_dec(&fs_info->balance_cancel_req);
3778         mutex_unlock(&fs_info->balance_mutex);
3779         return 0;
3780 }
3781
3782 static int btrfs_uuid_scan_kthread(void *data)
3783 {
3784         struct btrfs_fs_info *fs_info = data;
3785         struct btrfs_root *root = fs_info->tree_root;
3786         struct btrfs_key key;
3787         struct btrfs_key max_key;
3788         struct btrfs_path *path = NULL;
3789         int ret = 0;
3790         struct extent_buffer *eb;
3791         int slot;
3792         struct btrfs_root_item root_item;
3793         u32 item_size;
3794         struct btrfs_trans_handle *trans = NULL;
3795
3796         path = btrfs_alloc_path();
3797         if (!path) {
3798                 ret = -ENOMEM;
3799                 goto out;
3800         }
3801
3802         key.objectid = 0;
3803         key.type = BTRFS_ROOT_ITEM_KEY;
3804         key.offset = 0;
3805
3806         max_key.objectid = (u64)-1;
3807         max_key.type = BTRFS_ROOT_ITEM_KEY;
3808         max_key.offset = (u64)-1;
3809
3810         while (1) {
3811                 ret = btrfs_search_forward(root, &key, path, 0);
3812                 if (ret) {
3813                         if (ret > 0)
3814                                 ret = 0;
3815                         break;
3816                 }
3817
3818                 if (key.type != BTRFS_ROOT_ITEM_KEY ||
3819                     (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
3820                      key.objectid != BTRFS_FS_TREE_OBJECTID) ||
3821                     key.objectid > BTRFS_LAST_FREE_OBJECTID)
3822                         goto skip;
3823
3824                 eb = path->nodes[0];
3825                 slot = path->slots[0];
3826                 item_size = btrfs_item_size_nr(eb, slot);
3827                 if (item_size < sizeof(root_item))
3828                         goto skip;
3829
3830                 read_extent_buffer(eb, &root_item,
3831                                    btrfs_item_ptr_offset(eb, slot),
3832                                    (int)sizeof(root_item));
3833                 if (btrfs_root_refs(&root_item) == 0)
3834                         goto skip;
3835
3836                 if (!btrfs_is_empty_uuid(root_item.uuid) ||
3837                     !btrfs_is_empty_uuid(root_item.received_uuid)) {
3838                         if (trans)
3839                                 goto update_tree;
3840
3841                         btrfs_release_path(path);
3842                         /*
3843                          * 1 - subvol uuid item
3844                          * 1 - received_subvol uuid item
3845                          */
3846                         trans = btrfs_start_transaction(fs_info->uuid_root, 2);
3847                         if (IS_ERR(trans)) {
3848                                 ret = PTR_ERR(trans);
3849                                 break;
3850                         }
3851                         continue;
3852                 } else {
3853                         goto skip;
3854                 }
3855 update_tree:
3856                 if (!btrfs_is_empty_uuid(root_item.uuid)) {
3857                         ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
3858                                                   root_item.uuid,
3859                                                   BTRFS_UUID_KEY_SUBVOL,
3860                                                   key.objectid);
3861                         if (ret < 0) {
3862                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
3863                                         ret);
3864                                 break;
3865                         }
3866                 }
3867
3868                 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
3869                         ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
3870                                                   root_item.received_uuid,
3871                                                  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
3872                                                   key.objectid);
3873                         if (ret < 0) {
3874                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
3875                                         ret);
3876                                 break;
3877                         }
3878                 }
3879
3880 skip:
3881                 if (trans) {
3882                         ret = btrfs_end_transaction(trans, fs_info->uuid_root);
3883                         trans = NULL;
3884                         if (ret)
3885                                 break;
3886                 }
3887
3888                 btrfs_release_path(path);
3889                 if (key.offset < (u64)-1) {
3890                         key.offset++;
3891                 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
3892                         key.offset = 0;
3893                         key.type = BTRFS_ROOT_ITEM_KEY;
3894                 } else if (key.objectid < (u64)-1) {
3895                         key.offset = 0;
3896                         key.type = BTRFS_ROOT_ITEM_KEY;
3897                         key.objectid++;
3898                 } else {
3899                         break;
3900                 }
3901                 cond_resched();
3902         }
3903
3904 out:
3905         btrfs_free_path(path);
3906         if (trans && !IS_ERR(trans))
3907                 btrfs_end_transaction(trans, fs_info->uuid_root);
3908         if (ret)
3909                 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
3910         else
3911                 fs_info->update_uuid_tree_gen = 1;
3912         up(&fs_info->uuid_tree_rescan_sem);
3913         return 0;
3914 }
3915
3916 /*
3917  * Callback for btrfs_uuid_tree_iterate().
3918  * returns:
3919  * 0    check succeeded, the entry is not outdated.
3920  * < 0  if an error occured.
3921  * > 0  if the check failed, which means the caller shall remove the entry.
3922  */
3923 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
3924                                        u8 *uuid, u8 type, u64 subid)
3925 {
3926         struct btrfs_key key;
3927         int ret = 0;
3928         struct btrfs_root *subvol_root;
3929
3930         if (type != BTRFS_UUID_KEY_SUBVOL &&
3931             type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
3932                 goto out;
3933
3934         key.objectid = subid;
3935         key.type = BTRFS_ROOT_ITEM_KEY;
3936         key.offset = (u64)-1;
3937         subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
3938         if (IS_ERR(subvol_root)) {
3939                 ret = PTR_ERR(subvol_root);
3940                 if (ret == -ENOENT)
3941                         ret = 1;
3942                 goto out;
3943         }
3944
3945         switch (type) {
3946         case BTRFS_UUID_KEY_SUBVOL:
3947                 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
3948                         ret = 1;
3949                 break;
3950         case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
3951                 if (memcmp(uuid, subvol_root->root_item.received_uuid,
3952                            BTRFS_UUID_SIZE))
3953                         ret = 1;
3954                 break;
3955         }
3956
3957 out:
3958         return ret;
3959 }
3960
3961 static int btrfs_uuid_rescan_kthread(void *data)
3962 {
3963         struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
3964         int ret;
3965
3966         /*
3967          * 1st step is to iterate through the existing UUID tree and
3968          * to delete all entries that contain outdated data.
3969          * 2nd step is to add all missing entries to the UUID tree.
3970          */
3971         ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
3972         if (ret < 0) {
3973                 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
3974                 up(&fs_info->uuid_tree_rescan_sem);
3975                 return ret;
3976         }
3977         return btrfs_uuid_scan_kthread(data);
3978 }
3979
3980 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
3981 {
3982         struct btrfs_trans_handle *trans;
3983         struct btrfs_root *tree_root = fs_info->tree_root;
3984         struct btrfs_root *uuid_root;
3985         struct task_struct *task;
3986         int ret;
3987
3988         /*
3989          * 1 - root node
3990          * 1 - root item
3991          */
3992         trans = btrfs_start_transaction(tree_root, 2);
3993         if (IS_ERR(trans))
3994                 return PTR_ERR(trans);
3995
3996         uuid_root = btrfs_create_tree(trans, fs_info,
3997                                       BTRFS_UUID_TREE_OBJECTID);
3998         if (IS_ERR(uuid_root)) {
3999                 ret = PTR_ERR(uuid_root);
4000                 btrfs_abort_transaction(trans, tree_root, ret);
4001                 return ret;
4002         }
4003
4004         fs_info->uuid_root = uuid_root;
4005
4006         ret = btrfs_commit_transaction(trans, tree_root);
4007         if (ret)
4008                 return ret;
4009
4010         down(&fs_info->uuid_tree_rescan_sem);
4011         task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4012         if (IS_ERR(task)) {
4013                 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
4014                 btrfs_warn(fs_info, "failed to start uuid_scan task");
4015                 up(&fs_info->uuid_tree_rescan_sem);
4016                 return PTR_ERR(task);
4017         }
4018
4019         return 0;
4020 }
4021
4022 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
4023 {
4024         struct task_struct *task;
4025
4026         down(&fs_info->uuid_tree_rescan_sem);
4027         task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
4028         if (IS_ERR(task)) {
4029                 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
4030                 btrfs_warn(fs_info, "failed to start uuid_rescan task");
4031                 up(&fs_info->uuid_tree_rescan_sem);
4032                 return PTR_ERR(task);
4033         }
4034
4035         return 0;
4036 }
4037
4038 /*
4039  * shrinking a device means finding all of the device extents past
4040  * the new size, and then following the back refs to the chunks.
4041  * The chunk relocation code actually frees the device extent
4042  */
4043 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4044 {
4045         struct btrfs_trans_handle *trans;
4046         struct btrfs_root *root = device->dev_root;
4047         struct btrfs_dev_extent *dev_extent = NULL;
4048         struct btrfs_path *path;
4049         u64 length;
4050         u64 chunk_offset;
4051         int ret;
4052         int slot;
4053         int failed = 0;
4054         bool retried = false;
4055         bool checked_pending_chunks = false;
4056         struct extent_buffer *l;
4057         struct btrfs_key key;
4058         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4059         u64 old_total = btrfs_super_total_bytes(super_copy);
4060         u64 old_size = btrfs_device_get_total_bytes(device);
4061         u64 diff = old_size - new_size;
4062
4063         if (device->is_tgtdev_for_dev_replace)
4064                 return -EINVAL;
4065
4066         path = btrfs_alloc_path();
4067         if (!path)
4068                 return -ENOMEM;
4069
4070         path->reada = 2;
4071
4072         lock_chunks(root);
4073
4074         btrfs_device_set_total_bytes(device, new_size);
4075         if (device->writeable) {
4076                 device->fs_devices->total_rw_bytes -= diff;
4077                 spin_lock(&root->fs_info->free_chunk_lock);
4078                 root->fs_info->free_chunk_space -= diff;
4079                 spin_unlock(&root->fs_info->free_chunk_lock);
4080         }
4081         unlock_chunks(root);
4082
4083 again:
4084         key.objectid = device->devid;
4085         key.offset = (u64)-1;
4086         key.type = BTRFS_DEV_EXTENT_KEY;
4087
4088         do {
4089                 mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
4090                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4091                 if (ret < 0) {
4092                         mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
4093                         goto done;
4094                 }
4095
4096                 ret = btrfs_previous_item(root, path, 0, key.type);
4097                 if (ret)
4098                         mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
4099                 if (ret < 0)
4100                         goto done;
4101                 if (ret) {
4102                         ret = 0;
4103                         btrfs_release_path(path);
4104                         break;
4105                 }
4106
4107                 l = path->nodes[0];
4108                 slot = path->slots[0];
4109                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4110
4111                 if (key.objectid != device->devid) {
4112                         mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
4113                         btrfs_release_path(path);
4114                         break;
4115                 }
4116
4117                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4118                 length = btrfs_dev_extent_length(l, dev_extent);
4119
4120                 if (key.offset + length <= new_size) {
4121                         mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
4122                         btrfs_release_path(path);
4123                         break;
4124                 }
4125
4126                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4127                 btrfs_release_path(path);
4128
4129                 ret = btrfs_relocate_chunk(root, chunk_offset);
4130                 mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
4131                 if (ret && ret != -ENOSPC)
4132                         goto done;
4133                 if (ret == -ENOSPC)
4134                         failed++;
4135         } while (key.offset-- > 0);
4136
4137         if (failed && !retried) {
4138                 failed = 0;
4139                 retried = true;
4140                 goto again;
4141         } else if (failed && retried) {
4142                 ret = -ENOSPC;
4143                 goto done;
4144         }
4145
4146         /* Shrinking succeeded, else we would be at "done". */
4147         trans = btrfs_start_transaction(root, 0);
4148         if (IS_ERR(trans)) {
4149                 ret = PTR_ERR(trans);
4150                 goto done;
4151         }
4152
4153         lock_chunks(root);
4154
4155         /*
4156          * We checked in the above loop all device extents that were already in
4157          * the device tree. However before we have updated the device's
4158          * total_bytes to the new size, we might have had chunk allocations that
4159          * have not complete yet (new block groups attached to transaction
4160          * handles), and therefore their device extents were not yet in the
4161          * device tree and we missed them in the loop above. So if we have any
4162          * pending chunk using a device extent that overlaps the device range
4163          * that we can not use anymore, commit the current transaction and
4164          * repeat the search on the device tree - this way we guarantee we will
4165          * not have chunks using device extents that end beyond 'new_size'.
4166          */
4167         if (!checked_pending_chunks) {
4168                 u64 start = new_size;
4169                 u64 len = old_size - new_size;
4170
4171                 if (contains_pending_extent(trans->transaction, device,
4172                                             &start, len)) {
4173                         unlock_chunks(root);
4174                         checked_pending_chunks = true;
4175                         failed = 0;
4176                         retried = false;
4177                         ret = btrfs_commit_transaction(trans, root);
4178                         if (ret)
4179                                 goto done;
4180                         goto again;
4181                 }
4182         }
4183
4184         btrfs_device_set_disk_total_bytes(device, new_size);
4185         if (list_empty(&device->resized_list))
4186                 list_add_tail(&device->resized_list,
4187                               &root->fs_info->fs_devices->resized_devices);
4188
4189         WARN_ON(diff > old_total);
4190         btrfs_set_super_total_bytes(super_copy, old_total - diff);
4191         unlock_chunks(root);
4192
4193         /* Now btrfs_update_device() will change the on-disk size. */
4194         ret = btrfs_update_device(trans, device);
4195         btrfs_end_transaction(trans, root);
4196 done:
4197         btrfs_free_path(path);
4198         if (ret) {
4199                 lock_chunks(root);
4200                 btrfs_device_set_total_bytes(device, old_size);
4201                 if (device->writeable)
4202                         device->fs_devices->total_rw_bytes += diff;
4203                 spin_lock(&root->fs_info->free_chunk_lock);
4204                 root->fs_info->free_chunk_space += diff;
4205                 spin_unlock(&root->fs_info->free_chunk_lock);
4206                 unlock_chunks(root);
4207         }
4208         return ret;
4209 }
4210
4211 static int btrfs_add_system_chunk(struct btrfs_root *root,
4212                            struct btrfs_key *key,
4213                            struct btrfs_chunk *chunk, int item_size)
4214 {
4215         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4216         struct btrfs_disk_key disk_key;
4217         u32 array_size;
4218         u8 *ptr;
4219
4220         lock_chunks(root);
4221         array_size = btrfs_super_sys_array_size(super_copy);
4222         if (array_size + item_size + sizeof(disk_key)
4223                         > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4224                 unlock_chunks(root);
4225                 return -EFBIG;
4226         }
4227
4228         ptr = super_copy->sys_chunk_array + array_size;
4229         btrfs_cpu_key_to_disk(&disk_key, key);
4230         memcpy(ptr, &disk_key, sizeof(disk_key));
4231         ptr += sizeof(disk_key);
4232         memcpy(ptr, chunk, item_size);
4233         item_size += sizeof(disk_key);
4234         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4235         unlock_chunks(root);
4236
4237         return 0;
4238 }
4239
4240 /*
4241  * sort the devices in descending order by max_avail, total_avail
4242  */
4243 static int btrfs_cmp_device_info(const void *a, const void *b)
4244 {
4245         const struct btrfs_device_info *di_a = a;
4246         const struct btrfs_device_info *di_b = b;
4247
4248         if (di_a->max_avail > di_b->max_avail)
4249                 return -1;
4250         if (di_a->max_avail < di_b->max_avail)
4251                 return 1;
4252         if (di_a->total_avail > di_b->total_avail)
4253                 return -1;
4254         if (di_a->total_avail < di_b->total_avail)
4255                 return 1;
4256         return 0;
4257 }
4258
4259 static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
4260         [BTRFS_RAID_RAID10] = {
4261                 .sub_stripes    = 2,
4262                 .dev_stripes    = 1,
4263                 .devs_max       = 0,    /* 0 == as many as possible */
4264                 .devs_min       = 4,
4265                 .devs_increment = 2,
4266                 .ncopies        = 2,
4267         },
4268         [BTRFS_RAID_RAID1] = {
4269                 .sub_stripes    = 1,
4270                 .dev_stripes    = 1,
4271                 .devs_max       = 2,
4272                 .devs_min       = 2,
4273                 .devs_increment = 2,
4274                 .ncopies        = 2,
4275         },
4276         [BTRFS_RAID_DUP] = {
4277                 .sub_stripes    = 1,
4278                 .dev_stripes    = 2,
4279                 .devs_max       = 1,
4280                 .devs_min       = 1,
4281                 .devs_increment = 1,
4282                 .ncopies        = 2,
4283         },
4284         [BTRFS_RAID_RAID0] = {
4285                 .sub_stripes    = 1,
4286                 .dev_stripes    = 1,
4287                 .devs_max       = 0,
4288                 .devs_min       = 2,
4289                 .devs_increment = 1,
4290                 .ncopies        = 1,
4291         },
4292         [BTRFS_RAID_SINGLE] = {
4293                 .sub_stripes    = 1,
4294                 .dev_stripes    = 1,
4295                 .devs_max       = 1,
4296                 .devs_min       = 1,
4297                 .devs_increment = 1,
4298                 .ncopies        = 1,
4299         },
4300         [BTRFS_RAID_RAID5] = {
4301                 .sub_stripes    = 1,
4302                 .dev_stripes    = 1,
4303                 .devs_max       = 0,
4304                 .devs_min       = 2,
4305                 .devs_increment = 1,
4306                 .ncopies        = 2,
4307         },
4308         [BTRFS_RAID_RAID6] = {
4309                 .sub_stripes    = 1,
4310                 .dev_stripes    = 1,
4311                 .devs_max       = 0,
4312                 .devs_min       = 3,
4313                 .devs_increment = 1,
4314                 .ncopies        = 3,
4315         },
4316 };
4317
4318 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
4319 {
4320         /* TODO allow them to set a preferred stripe size */
4321         return 64 * 1024;
4322 }
4323
4324 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4325 {
4326         if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4327                 return;
4328
4329         btrfs_set_fs_incompat(info, RAID56);
4330 }
4331
/*
 * Upper bound for the number of stripes of a chunk item stored in a leaf of
 * root @r: the leaf data area minus one item header and one struct
 * btrfs_chunk, divided by the size of a stripe.  The "+ 1" presumably
 * accounts for the stripe that is already embedded in struct btrfs_chunk -
 * confirm against the structure definition.
 */
#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r)             \
                        - sizeof(struct btrfs_item)             \
                        - sizeof(struct btrfs_chunk))           \
                        / sizeof(struct btrfs_stripe) + 1)

/*
 * Same bound for system chunks, which are limited by the size of the super
 * block's system chunk array.  Room for two disk keys and two chunk
 * structures is subtracted.
 * NOTE(review): presumably this reserves space for a second system chunk
 * entry in the array - confirm the rationale.
 */
#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE        \
                                - 2 * sizeof(struct btrfs_disk_key)     \
                                - 2 * sizeof(struct btrfs_chunk))       \
                                / sizeof(struct btrfs_stripe) + 1)
4341
/*
 * Allocate a new chunk of profile @type at logical address @start.
 *
 * This picks the devices and the per-device stripe size, builds the
 * in-memory mapping (an extent map carrying a struct map_lookup), inserts
 * it into the mapping tree, queues it on the transaction's pending_chunks
 * list, creates the block group and updates bytes_used of the involved
 * devices as well as fs_info->free_chunk_space.  No chunk tree item is
 * inserted by this function.
 *
 * Returns 0 on success, -ENOSPC if there are no or not enough usable
 * devices, -ENOMEM on allocation failure, or the error returned by
 * find_free_dev_extent(), add_extent_mapping() or btrfs_make_block_group().
 */
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                               struct btrfs_root *extent_root, u64 start,
                               u64 type)
{
        struct btrfs_fs_info *info = extent_root->fs_info;
        struct btrfs_fs_devices *fs_devices = info->fs_devices;
        struct list_head *cur;
        struct map_lookup *map = NULL;
        struct extent_map_tree *em_tree;
        struct extent_map *em;
        struct btrfs_device_info *devices_info = NULL;
        u64 total_avail;
        int num_stripes;        /* total number of stripes to allocate */
        int data_stripes;       /* number of stripes that count for
                                   block group size */
        int sub_stripes;        /* sub_stripes info for map */
        int dev_stripes;        /* stripes per dev */
        int devs_max;           /* max devs to use */
        int devs_min;           /* min devs needed */
        int devs_increment;     /* ndevs has to be a multiple of this */
        int ncopies;            /* how many copies to data has */
        int ret;
        u64 max_stripe_size;
        u64 max_chunk_size;
        u64 stripe_size;
        u64 num_bytes;
        u64 raid_stripe_len = BTRFS_STRIPE_LEN;
        int ndevs;
        int i;
        int j;
        int index;

        BUG_ON(!alloc_profile_is_valid(type, 0));

        if (list_empty(&fs_devices->alloc_list))
                return -ENOSPC;

        /* Look up the per-profile parameters in btrfs_raid_array. */
        index = __get_raid_index(type);

        sub_stripes = btrfs_raid_array[index].sub_stripes;
        dev_stripes = btrfs_raid_array[index].dev_stripes;
        devs_max = btrfs_raid_array[index].devs_max;
        devs_min = btrfs_raid_array[index].devs_min;
        devs_increment = btrfs_raid_array[index].devs_increment;
        ncopies = btrfs_raid_array[index].ncopies;

        /*
         * Size limits depend on the block group type.  A devs_max of 0 in
         * the table means "as many as possible" and is replaced by the
         * number of stripes that fit into a chunk item.
         */
        if (type & BTRFS_BLOCK_GROUP_DATA) {
                max_stripe_size = 1024 * 1024 * 1024;
                max_chunk_size = 10 * max_stripe_size;
                if (!devs_max)
                        devs_max = BTRFS_MAX_DEVS(info->chunk_root);
        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
                /* for larger filesystems, use larger metadata chunks */
                if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
                        max_stripe_size = 1024 * 1024 * 1024;
                else
                        max_stripe_size = 256 * 1024 * 1024;
                max_chunk_size = max_stripe_size;
                if (!devs_max)
                        devs_max = BTRFS_MAX_DEVS(info->chunk_root);
        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
                max_stripe_size = 32 * 1024 * 1024;
                max_chunk_size = 2 * max_stripe_size;
                if (!devs_max)
                        devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
        } else {
                btrfs_err(info, "invalid chunk type 0x%llx requested",
                       type);
                BUG_ON(1);
        }

        /* we don't want a chunk larger than 10% of writeable space */
        max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
                             max_chunk_size);

        /* One slot per rw device; the loop below never fills more. */
        devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
                               GFP_NOFS);
        if (!devices_info)
                return -ENOMEM;

        cur = fs_devices->alloc_list.next;

        /*
         * in the first pass through the devices list, we gather information
         * about the available holes on each device.
         */
        ndevs = 0;
        while (cur != &fs_devices->alloc_list) {
                struct btrfs_device *device;
                u64 max_avail;
                u64 dev_offset;

                device = list_entry(cur, struct btrfs_device, dev_alloc_list);

                cur = cur->next;

                if (!device->writeable) {
                        WARN(1, KERN_ERR
                               "BTRFS: read-only device in alloc_list\n");
                        continue;
                }

                /* Skip devices that must not receive new chunks. */
                if (!device->in_fs_metadata ||
                    device->is_tgtdev_for_dev_replace)
                        continue;

                if (device->total_bytes > device->bytes_used)
                        total_avail = device->total_bytes - device->bytes_used;
                else
                        total_avail = 0;

                /* If there is no space on this device, skip it. */
                if (total_avail == 0)
                        continue;

                /*
                 * Ask for a hole big enough for a maximum sized stripe (times
                 * the stripes per device).  -ENOSPC is not fatal here; the
                 * size then left in max_avail is used instead.
                 */
                ret = find_free_dev_extent(trans, device,
                                           max_stripe_size * dev_stripes,
                                           &dev_offset, &max_avail);
                if (ret && ret != -ENOSPC)
                        goto error;

                /* A full sized hole was found: cap max_avail at that size. */
                if (ret == 0)
                        max_avail = max_stripe_size * dev_stripes;

                /* Too small for even a minimal stripe on this device. */
                if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
                        continue;

                /* Guard against overrunning devices_info. */
                if (ndevs == fs_devices->rw_devices) {
                        WARN(1, "%s: found more than %llu devices\n",
                             __func__, fs_devices->rw_devices);
                        break;
                }
                devices_info[ndevs].dev_offset = dev_offset;
                devices_info[ndevs].max_avail = max_avail;
                devices_info[ndevs].total_avail = total_avail;
                devices_info[ndevs].dev = device;
                ++ndevs;
        }

        /*
         * now sort the devices by hole size / available space
         * (descending, see btrfs_cmp_device_info())
         */
        sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
             btrfs_cmp_device_info, NULL);

        /* round down to number of usable stripes */
        ndevs -= ndevs % devs_increment;

        if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
                ret = -ENOSPC;
                goto error;
        }

        if (devs_max && ndevs > devs_max)
                ndevs = devs_max;
        /*
         * the primary goal is to maximize the number of stripes, so use as many
         * devices as possible, even if the stripes are not maximum sized.
         * The array is sorted in descending order, so the last device used
         * has the smallest hole and limits the stripe size.
         */
        stripe_size = devices_info[ndevs-1].max_avail;
        num_stripes = ndevs * dev_stripes;

        /*
         * this will have to be fixed for RAID1 and RAID10 over
         * more drives
         */
        data_stripes = num_stripes / ncopies;

        /* RAID5/6: all stripes but one resp. two count as data stripes. */
        if (type & BTRFS_BLOCK_GROUP_RAID5) {
                raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
                                 btrfs_super_stripesize(info->super_copy));
                data_stripes = num_stripes - 1;
        }
        if (type & BTRFS_BLOCK_GROUP_RAID6) {
                raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
                                 btrfs_super_stripesize(info->super_copy));
                data_stripes = num_stripes - 2;
        }

        /*
         * Use the number of data stripes to figure out how big this chunk
         * is really going to be in terms of logical address space,
         * and compare that answer with the max chunk size
         */
        if (stripe_size * data_stripes > max_chunk_size) {
                u64 mask = (1ULL << 24) - 1;

                stripe_size = div_u64(max_chunk_size, data_stripes);

                /* bump the answer up to a 16MB boundary */
                stripe_size = (stripe_size + mask) & ~mask;

                /* but don't go higher than the limits we found
                 * while searching for free extents
                 */
                if (stripe_size > devices_info[ndevs-1].max_avail)
                        stripe_size = devices_info[ndevs-1].max_avail;
        }

        /* Up to here stripe_size covered all stripes of one device. */
        stripe_size = div_u64(stripe_size, dev_stripes);

        /* align down to a multiple of raid_stripe_len */
        stripe_size = div_u64(stripe_size, raid_stripe_len);
        stripe_size *= raid_stripe_len;

        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
        if (!map) {
                ret = -ENOMEM;
                goto error;
        }
        map->num_stripes = num_stripes;

        /* Stripes of the same device are laid out back to back. */
        for (i = 0; i < ndevs; ++i) {
                for (j = 0; j < dev_stripes; ++j) {
                        int s = i * dev_stripes + j;
                        map->stripes[s].dev = devices_info[i].dev;
                        map->stripes[s].physical = devices_info[i].dev_offset +
                                                   j * stripe_size;
                }
        }
        map->sector_size = extent_root->sectorsize;
        map->stripe_len = raid_stripe_len;
        map->io_align = raid_stripe_len;
        map->io_width = raid_stripe_len;
        map->type = type;
        map->sub_stripes = sub_stripes;

        /* Logical size of the chunk. */
        num_bytes = stripe_size * data_stripes;

        trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);

        em = alloc_extent_map();
        if (!em) {
                kfree(map);
                ret = -ENOMEM;
                goto error;
        }
        /*
         * The map is attached to the extent map through em->bdev.
         * NOTE(review): from here on the error paths only drop em
         * references and never kfree(map) directly, so free_extent_map()
         * presumably frees the map for EXTENT_FLAG_FS_MAPPING entries -
         * confirm in extent_map.c.
         */
        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
        em->bdev = (struct block_device *)map;
        em->start = start;
        em->len = num_bytes;
        em->block_start = 0;
        em->block_len = em->len;
        /* btrfs_finish_chunk_alloc() reads the stripe size back from here. */
        em->orig_block_len = stripe_size;

        em_tree = &extent_root->fs_info->mapping_tree.map_tree;
        write_lock(&em_tree->lock);
        ret = add_extent_mapping(em_tree, em, 0);
        if (!ret) {
                /* Extra reference for the pending_chunks list. */
                list_add_tail(&em->list, &trans->transaction->pending_chunks);
                atomic_inc(&em->refs);
        }
        write_unlock(&em_tree->lock);
        if (ret) {
                free_extent_map(em);
                goto error;
        }

        ret = btrfs_make_block_group(trans, extent_root, 0, type,
                                     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
                                     start, num_bytes);
        if (ret)
                goto error_del_extent;

        /* Account one stripe_size per stripe on its device (num_bytes is reused as scratch). */
        for (i = 0; i < map->num_stripes; i++) {
                num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
                btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
        }

        spin_lock(&extent_root->fs_info->free_chunk_lock);
        extent_root->fs_info->free_chunk_space -= (stripe_size *
                                                   map->num_stripes);
        spin_unlock(&extent_root->fs_info->free_chunk_lock);

        /* Drop the reference obtained from alloc_extent_map(). */
        free_extent_map(em);
        check_raid56_incompat_flag(extent_root->fs_info, type);

        kfree(devices_info);
        return 0;

error_del_extent:
        write_lock(&em_tree->lock);
        remove_extent_mapping(em_tree, em);
        write_unlock(&em_tree->lock);

        /* One for our allocation */
        free_extent_map(em);
        /* One for the tree reference */
        free_extent_map(em);
        /* One for the pending_chunks list reference */
        free_extent_map(em);
error:
        kfree(devices_info);
        return ret;
}
4637
4638 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4639                                 struct btrfs_root *extent_root,
4640                                 u64 chunk_offset, u64 chunk_size)
4641 {
4642         struct btrfs_key key;
4643         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
4644         struct btrfs_device *device;
4645         struct btrfs_chunk *chunk;
4646         struct btrfs_stripe *stripe;
4647         struct extent_map_tree *em_tree;
4648         struct extent_map *em;
4649         struct map_lookup *map;
4650         size_t item_size;
4651         u64 dev_offset;
4652         u64 stripe_size;
4653         int i = 0;
4654         int ret;
4655
4656         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
4657         read_lock(&em_tree->lock);
4658         em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
4659         read_unlock(&em_tree->lock);
4660
4661         if (!em) {
4662                 btrfs_crit(extent_root->fs_info, "unable to find logical "
4663                            "%Lu len %Lu", chunk_offset, chunk_size);
4664                 return -EINVAL;
4665         }
4666
4667         if (em->start != chunk_offset || em->len != chunk_size) {
4668                 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
4669                           " %Lu-%Lu, found %Lu-%Lu", chunk_offset,
4670                           chunk_size, em->start, em->len);
4671                 free_extent_map(em);
4672                 return -EINVAL;
4673         }
4674
4675         map = (struct map_lookup *)em->bdev;
4676         item_size = btrfs_chunk_item_size(map->num_stripes);
4677         stripe_size = em->orig_block_len;
4678
4679         chunk = kzalloc(item_size, GFP_NOFS);
4680         if (!chunk) {
4681                 ret = -ENOMEM;
4682                 goto out;
4683         }
4684
4685         for (i = 0; i < map->num_stripes; i++) {
4686                 device = map->stripes[i].dev;
4687                 dev_offset = map->stripes[i].physical;
4688
4689                 ret = btrfs_update_device(trans, device);
4690                 if (ret)
4691                         goto out;
4692                 ret = btrfs_alloc_dev_extent(trans, device,
4693                                              chunk_root->root_key.objectid,
4694                                              BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4695                                              chunk_offset, dev_offset,
4696                                              stripe_size);
4697                 if (ret)
4698                         goto out;
4699         }
4700
4701         stripe = &chunk->stripe;
4702         for (i = 0; i < map->num_stripes; i++) {
4703                 device = map->stripes[i].dev;
4704                 dev_offset = map->stripes[i].physical;
4705
4706                 btrfs_set_stack_stripe_devid(stripe, device->devid);
4707                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
4708                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4709                 stripe++;
4710         }
4711
4712         btrfs_set_stack_chunk_length(chunk, chunk_size);
4713         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
4714         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
4715         btrfs_set_stack_chunk_type(chunk, map->type);
4716         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
4717         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
4718         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
4719         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
4720         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
4721
4722         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
4723         key.type = BTRFS_CHUNK_ITEM_KEY;
4724         key.offset = chunk_offset;
4725
4726         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4727         if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4728                 /*
4729                  * TODO: Cleanup of inserted chunk root in case of
4730                  * failure.
4731                  */
4732                 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
4733                                              item_size);
4734         }
4735
4736 out:
4737         kfree(chunk);
4738         free_extent_map(em);
4739         return ret;
4740 }
4741
4742 /*
4743  * Chunk allocation falls into two parts. The first part does works
4744  * that make the new allocated chunk useable, but not do any operation
4745  * that modifies the chunk tree. The second part does the works that
4746  * require modifying the chunk tree. This division is important for the
4747  * bootstrap process of adding storage to a seed btrfs.
4748  */
4749 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4750                       struct btrfs_root *extent_root, u64 type)
4751 {
4752         u64 chunk_offset;
4753
4754         ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex));
4755         chunk_offset = find_next_chunk(extent_root->fs_info);
4756         return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
4757 }
4758
4759 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4760                                          struct btrfs_root *root,
4761                                          struct btrfs_device *device)
4762 {
4763         u64 chunk_offset;
4764         u64 sys_chunk_offset;
4765         u64 alloc_profile;
4766         struct btrfs_fs_info *fs_info = root->fs_info;
4767         struct btrfs_root *extent_root = fs_info->extent_root;
4768         int ret;
4769
4770         chunk_offset = find_next_chunk(fs_info);
4771         alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
4772         ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
4773                                   alloc_profile);
4774         if (ret)
4775                 return ret;
4776
4777         sys_chunk_offset = find_next_chunk(root->fs_info);
4778         alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4779         ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4780                                   alloc_profile);
4781         return ret;
4782 }
4783
4784 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
4785 {
4786         int max_errors;
4787
4788         if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4789                          BTRFS_BLOCK_GROUP_RAID10 |
4790                          BTRFS_BLOCK_GROUP_RAID5 |
4791                          BTRFS_BLOCK_GROUP_DUP)) {
4792                 max_errors = 1;
4793         } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4794                 max_errors = 2;
4795         } else {
4796                 max_errors = 0;
4797         }
4798
4799         return max_errors;
4800 }
4801
4802 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4803 {
4804         struct extent_map *em;
4805         struct map_lookup *map;
4806         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4807         int readonly = 0;
4808         int miss_ndevs = 0;
4809         int i;
4810
4811         read_lock(&map_tree->map_tree.lock);
4812         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
4813         read_unlock(&map_tree->map_tree.lock);
4814         if (!em)
4815                 return 1;
4816
4817         map = (struct map_lookup *)em->bdev;
4818         for (i = 0; i < map->num_stripes; i++) {
4819                 if (map->stripes[i].dev->missing) {
4820                         miss_ndevs++;
4821                         continue;
4822                 }
4823
4824                 if (!map->stripes[i].dev->writeable) {
4825                         readonly = 1;
4826                         goto end;
4827                 }
4828         }
4829
4830         /*
4831          * If the number of missing devices is larger than max errors,
4832          * we can not write the data into that chunk successfully, so
4833          * set it readonly.
4834          */
4835         if (miss_ndevs > btrfs_chunk_max_errors(map))
4836                 readonly = 1;
4837 end:
4838         free_extent_map(em);
4839         return readonly;
4840 }
4841
4842 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
4843 {
4844         extent_map_tree_init(&tree->map_tree);
4845 }
4846
4847 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
4848 {
4849         struct extent_map *em;
4850
4851         while (1) {
4852                 write_lock(&tree->map_tree.lock);
4853                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
4854                 if (em)
4855                         remove_extent_mapping(&tree->map_tree, em);
4856                 write_unlock(&tree->map_tree.lock);
4857                 if (!em)
4858                         break;
4859                 /* once for us */
4860                 free_extent_map(em);
4861                 /* once for the tree */
4862                 free_extent_map(em);
4863         }
4864 }
4865
4866 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4867 {
4868         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
4869         struct extent_map *em;
4870         struct map_lookup *map;
4871         struct extent_map_tree *em_tree = &map_tree->map_tree;
4872         int ret;
4873
4874         read_lock(&em_tree->lock);
4875         em = lookup_extent_mapping(em_tree, logical, len);
4876         read_unlock(&em_tree->lock);
4877
4878         /*
4879          * We could return errors for these cases, but that could get ugly and
4880          * we'd probably do the same thing which is just not do anything else
4881          * and exit, so return 1 so the callers don't try to use other copies.
4882          */
4883         if (!em) {
4884                 btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
4885                             logical+len);
4886                 return 1;
4887         }
4888
4889         if (em->start > logical || em->start + em->len < logical) {
4890                 btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
4891                             "%Lu-%Lu", logical, logical+len, em->start,
4892                             em->start + em->len);
4893                 free_extent_map(em);
4894                 return 1;
4895         }
4896
4897         map = (struct map_lookup *)em->bdev;
4898         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
4899                 ret = map->num_stripes;
4900         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4901                 ret = map->sub_stripes;
4902         else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
4903                 ret = 2;
4904         else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4905                 ret = 3;
4906         else
4907                 ret = 1;
4908         free_extent_map(em);
4909
4910         btrfs_dev_replace_lock(&fs_info->dev_replace);
4911         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4912                 ret++;
4913         btrfs_dev_replace_unlock(&fs_info->dev_replace);
4914
4915         return ret;
4916 }
4917
4918 unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4919                                     struct btrfs_mapping_tree *map_tree,
4920                                     u64 logical)
4921 {
4922         struct extent_map *em;
4923         struct map_lookup *map;
4924         struct extent_map_tree *em_tree = &map_tree->map_tree;
4925         unsigned long len = root->sectorsize;
4926
4927         read_lock(&em_tree->lock);
4928         em = lookup_extent_mapping(em_tree, logical, len);
4929         read_unlock(&em_tree->lock);
4930         BUG_ON(!em);
4931
4932         BUG_ON(em->start > logical || em->start + em->len < logical);
4933         map = (struct map_lookup *)em->bdev;
4934         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
4935                 len = map->stripe_len * nr_data_stripes(map);
4936         free_extent_map(em);
4937         return len;
4938 }
4939
4940 int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4941                            u64 logical, u64 len, int mirror_num)
4942 {
4943         struct extent_map *em;
4944         struct map_lookup *map;
4945         struct extent_map_tree *em_tree = &map_tree->map_tree;
4946         int ret = 0;
4947
4948         read_lock(&em_tree->lock);
4949         em = lookup_extent_mapping(em_tree, logical, len);
4950         read_unlock(&em_tree->lock);
4951         BUG_ON(!em);
4952
4953         BUG_ON(em->start > logical || em->start + em->len < logical);
4954         map = (struct map_lookup *)em->bdev;
4955         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
4956                 ret = 1;
4957         free_extent_map(em);
4958         return ret;
4959 }
4960
4961 static int find_live_mirror(struct btrfs_fs_info *fs_info,
4962                             struct map_lookup *map, int first, int num,
4963                             int optimal, int dev_replace_is_ongoing)
4964 {
4965         int i;
4966         int tolerance;
4967         struct btrfs_device *srcdev;
4968
4969         if (dev_replace_is_ongoing &&
4970             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4971              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4972                 srcdev = fs_info->dev_replace.srcdev;
4973         else
4974                 srcdev = NULL;
4975
4976         /*
4977          * try to avoid the drive that is the source drive for a
4978          * dev-replace procedure, only choose it if no other non-missing
4979          * mirror is available
4980          */
4981         for (tolerance = 0; tolerance < 2; tolerance++) {
4982                 if (map->stripes[optimal].dev->bdev &&
4983                     (tolerance || map->stripes[optimal].dev != srcdev))
4984                         return optimal;
4985                 for (i = first; i < first + num; i++) {
4986                         if (map->stripes[i].dev->bdev &&
4987                             (tolerance || map->stripes[i].dev != srcdev))
4988                                 return i;
4989                 }
4990         }
4991
4992         /* we couldn't find one that doesn't fail.  Just return something
4993          * and the io error handling code will clean up eventually
4994          */
4995         return optimal;
4996 }
4997
4998 static inline int parity_smaller(u64 a, u64 b)
4999 {
5000         return a > b;
5001 }
5002
5003 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5004 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
5005 {
5006         struct btrfs_bio_stripe s;
5007         int i;
5008         u64 l;
5009         int again = 1;
5010
5011         while (again) {
5012                 again = 0;
5013                 for (i = 0; i < num_stripes - 1; i++) {
5014                         if (parity_smaller(bbio->raid_map[i],
5015                                            bbio->raid_map[i+1])) {
5016                                 s = bbio->stripes[i];
5017                                 l = bbio->raid_map[i];
5018                                 bbio->stripes[i] = bbio->stripes[i+1];
5019                                 bbio->raid_map[i] = bbio->raid_map[i+1];
5020                                 bbio->stripes[i+1] = s;
5021                                 bbio->raid_map[i+1] = l;
5022
5023                                 again = 1;
5024                         }
5025                 }
5026         }
5027 }
5028
5029 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5030 {
5031         struct btrfs_bio *bbio = kzalloc(
5032                  /* the size of the btrfs_bio */
5033                 sizeof(struct btrfs_bio) +
5034                 /* plus the variable array for the stripes */
5035                 sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5036                 /* plus the variable array for the tgt dev */
5037                 sizeof(int) * (real_stripes) +
5038                 /*
5039                  * plus the raid_map, which includes both the tgt dev
5040                  * and the stripes
5041                  */
5042                 sizeof(u64) * (total_stripes),
5043                 GFP_NOFS|__GFP_NOFAIL);
5044
5045         atomic_set(&bbio->error, 0);
5046         atomic_set(&bbio->refs, 1);
5047
5048         return bbio;
5049 }
5050
5051 void btrfs_get_bbio(struct btrfs_bio *bbio)
5052 {
5053         WARN_ON(!atomic_read(&bbio->refs));
5054         atomic_inc(&bbio->refs);
5055 }
5056
5057 void btrfs_put_bbio(struct btrfs_bio *bbio)
5058 {
5059         if (!bbio)
5060                 return;
5061         if (atomic_dec_and_test(&bbio->refs))
5062                 kfree(bbio);
5063 }
5064
5065 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5066                              u64 logical, u64 *length,
5067                              struct btrfs_bio **bbio_ret,
5068                              int mirror_num, int need_raid_map)
5069 {
5070         struct extent_map *em;
5071         struct map_lookup *map;
5072         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
5073         struct extent_map_tree *em_tree = &map_tree->map_tree;
5074         u64 offset;
5075         u64 stripe_offset;
5076         u64 stripe_end_offset;
5077         u64 stripe_nr;
5078         u64 stripe_nr_orig;
5079         u64 stripe_nr_end;
5080         u64 stripe_len;
5081         u32 stripe_index;
5082         int i;
5083         int ret = 0;
5084         int num_stripes;
5085         int max_errors = 0;
5086         int tgtdev_indexes = 0;
5087         struct btrfs_bio *bbio = NULL;
5088         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
5089         int dev_replace_is_ongoing = 0;
5090         int num_alloc_stripes;
5091         int patch_the_first_stripe_for_dev_replace = 0;
5092         u64 physical_to_patch_in_first_stripe = 0;
5093         u64 raid56_full_stripe_start = (u64)-1;
5094
5095         read_lock(&em_tree->lock);
5096         em = lookup_extent_mapping(em_tree, logical, *length);
5097         read_unlock(&em_tree->lock);
5098
5099         if (!em) {
5100                 btrfs_crit(fs_info, "unable to find logical %llu len %llu",
5101                         logical, *length);
5102                 return -EINVAL;
5103         }
5104
5105         if (em->start > logical || em->start + em->len < logical) {
5106                 btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
5107                            "found %Lu-%Lu", logical, em->start,
5108                            em->start + em->len);
5109                 free_extent_map(em);
5110                 return -EINVAL;
5111         }
5112
5113         map = (struct map_lookup *)em->bdev;
5114         offset = logical - em->start;
5115
5116         stripe_len = map->stripe_len;
5117         stripe_nr = offset;
5118         /*
5119          * stripe_nr counts the total number of stripes we have to stride
5120          * to get to this block
5121          */
5122         stripe_nr = div64_u64(stripe_nr, stripe_len);
5123
5124         stripe_offset = stripe_nr * stripe_len;
5125         BUG_ON(offset < stripe_offset);
5126
5127         /* stripe_offset is the offset of this block in its stripe*/
5128         stripe_offset = offset - stripe_offset;
5129
5130         /* if we're here for raid56, we need to know the stripe aligned start */
5131         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5132                 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
5133                 raid56_full_stripe_start = offset;
5134
5135                 /* allow a write of a full stripe, but make sure we don't
5136                  * allow straddling of stripes
5137                  */
5138                 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
5139                                 full_stripe_len);
5140                 raid56_full_stripe_start *= full_stripe_len;
5141         }
5142
5143         if (rw & REQ_DISCARD) {
5144                 /* we don't discard raid56 yet */
5145                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5146                         ret = -EOPNOTSUPP;
5147                         goto out;
5148                 }
5149                 *length = min_t(u64, em->len - offset, *length);
5150         } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5151                 u64 max_len;
5152                 /* For writes to RAID[56], allow a full stripeset across all disks.
5153                    For other RAID types and for RAID[56] reads, just allow a single
5154                    stripe (on a single disk). */
5155                 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
5156                     (rw & REQ_WRITE)) {
5157                         max_len = stripe_len * nr_data_stripes(map) -
5158                                 (offset - raid56_full_stripe_start);
5159                 } else {
5160                         /* we limit the length of each bio to what fits in a stripe */
5161                         max_len = stripe_len - stripe_offset;
5162                 }
5163                 *length = min_t(u64, em->len - offset, max_len);
5164         } else {
5165                 *length = em->len - offset;
5166         }
5167
5168         /* This is for when we're called from btrfs_merge_bio_hook() and all
5169            it cares about is the length */
5170         if (!bbio_ret)
5171                 goto out;
5172
5173         btrfs_dev_replace_lock(dev_replace);
5174         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
5175         if (!dev_replace_is_ongoing)
5176                 btrfs_dev_replace_unlock(dev_replace);
5177
5178         if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
5179             !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
5180             dev_replace->tgtdev != NULL) {
5181                 /*
5182                  * in dev-replace case, for repair case (that's the only
5183                  * case where the mirror is selected explicitly when
5184                  * calling btrfs_map_block), blocks left of the left cursor
5185                  * can also be read from the target drive.
5186                  * For REQ_GET_READ_MIRRORS, the target drive is added as
5187                  * the last one to the array of stripes. For READ, it also
5188                  * needs to be supported using the same mirror number.
5189                  * If the requested block is not left of the left cursor,
5190                  * EIO is returned. This can happen because btrfs_num_copies()
5191                  * returns one more in the dev-replace case.
5192                  */
5193                 u64 tmp_length = *length;
5194                 struct btrfs_bio *tmp_bbio = NULL;
5195                 int tmp_num_stripes;
5196                 u64 srcdev_devid = dev_replace->srcdev->devid;
5197                 int index_srcdev = 0;
5198                 int found = 0;
5199                 u64 physical_of_found = 0;
5200
5201                 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
5202                              logical, &tmp_length, &tmp_bbio, 0, 0);
5203                 if (ret) {
5204                         WARN_ON(tmp_bbio != NULL);
5205                         goto out;
5206                 }
5207
5208                 tmp_num_stripes = tmp_bbio->num_stripes;
5209                 if (mirror_num > tmp_num_stripes) {
5210                         /*
5211                          * REQ_GET_READ_MIRRORS does not contain this
5212                          * mirror, that means that the requested area
5213                          * is not left of the left cursor
5214                          */
5215                         ret = -EIO;
5216                         btrfs_put_bbio(tmp_bbio);
5217                         goto out;
5218                 }
5219
5220                 /*
5221                  * process the rest of the function using the mirror_num
5222                  * of the source drive. Therefore look it up first.
5223                  * At the end, patch the device pointer to the one of the
5224                  * target drive.
5225                  */
5226                 for (i = 0; i < tmp_num_stripes; i++) {
5227                         if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
5228                                 /*
5229                                  * In case of DUP, in order to keep it
5230                                  * simple, only add the mirror with the
5231                                  * lowest physical address
5232                                  */
5233                                 if (found &&
5234                                     physical_of_found <=
5235                                      tmp_bbio->stripes[i].physical)
5236                                         continue;
5237                                 index_srcdev = i;
5238                                 found = 1;
5239                                 physical_of_found =
5240                                         tmp_bbio->stripes[i].physical;
5241                         }
5242                 }
5243
5244                 if (found) {
5245                         mirror_num = index_srcdev + 1;
5246                         patch_the_first_stripe_for_dev_replace = 1;
5247                         physical_to_patch_in_first_stripe = physical_of_found;
5248                 } else {
5249                         WARN_ON(1);
5250                         ret = -EIO;
5251                         btrfs_put_bbio(tmp_bbio);
5252                         goto out;
5253                 }
5254
5255                 btrfs_put_bbio(tmp_bbio);
5256         } else if (mirror_num > map->num_stripes) {
5257                 mirror_num = 0;
5258         }
5259
5260         num_stripes = 1;
5261         stripe_index = 0;
5262         stripe_nr_orig = stripe_nr;
5263         stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
5264         stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
5265         stripe_end_offset = stripe_nr_end * map->stripe_len -
5266                             (offset + *length);
5267
5268         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5269                 if (rw & REQ_DISCARD)
5270                         num_stripes = min_t(u64, map->num_stripes,
5271                                             stripe_nr_end - stripe_nr_orig);
5272                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5273                                 &stripe_index);
5274                 if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
5275                         mirror_num = 1;
5276         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
5277                 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
5278                         num_stripes = map->num_stripes;
5279                 else if (mirror_num)
5280                         stripe_index = mirror_num - 1;
5281                 else {
5282                         stripe_index = find_live_mirror(fs_info, map, 0,
5283                                             map->num_stripes,
5284                                             current->pid % map->num_stripes,
5285                                             dev_replace_is_ongoing);
5286                         mirror_num = stripe_index + 1;
5287                 }
5288
5289         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
5290                 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
5291                         num_stripes = map->num_stripes;
5292                 } else if (mirror_num) {
5293                         stripe_index = mirror_num - 1;
5294                 } else {
5295                         mirror_num = 1;
5296                 }
5297
5298         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5299                 u32 factor = map->num_stripes / map->sub_stripes;
5300
5301                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5302                 stripe_index *= map->sub_stripes;
5303
5304                 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5305                         num_stripes = map->sub_stripes;
5306                 else if (rw & REQ_DISCARD)
5307                         num_stripes = min_t(u64, map->sub_stripes *
5308                                             (stripe_nr_end - stripe_nr_orig),
5309                                             map->num_stripes);
5310                 else if (mirror_num)
5311                         stripe_index += mirror_num - 1;
5312                 else {
5313                         int old_stripe_index = stripe_index;
5314                         stripe_index = find_live_mirror(fs_info, map,
5315                                               stripe_index,
5316                                               map->sub_stripes, stripe_index +
5317                                               current->pid % map->sub_stripes,
5318                                               dev_replace_is_ongoing);
5319                         mirror_num = stripe_index - old_stripe_index + 1;
5320                 }
5321
5322         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5323                 if (need_raid_map &&
5324                     ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
5325                      mirror_num > 1)) {
5326                         /* push stripe_nr back to the start of the full stripe */
5327                         stripe_nr = div_u64(raid56_full_stripe_start,
5328                                         stripe_len * nr_data_stripes(map));
5329
5330                         /* RAID[56] write or recovery. Return all stripes */
5331                         num_stripes = map->num_stripes;
5332                         max_errors = nr_parity_stripes(map);
5333
5334                         *length = map->stripe_len;
5335                         stripe_index = 0;
5336                         stripe_offset = 0;
5337                 } else {
5338                         /*
5339                          * Mirror #0 or #1 means the original data block.
5340                          * Mirror #2 is RAID5 parity block.
5341                          * Mirror #3 is RAID6 Q block.
5342                          */
5343                         stripe_nr = div_u64_rem(stripe_nr,
5344                                         nr_data_stripes(map), &stripe_index);
5345                         if (mirror_num > 1)
5346                                 stripe_index = nr_data_stripes(map) +
5347                                                 mirror_num - 2;
5348
5349                         /* We distribute the parity blocks across stripes */
5350                         div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
5351                                         &stripe_index);
5352                         if (!(rw & (REQ_WRITE | REQ_DISCARD |
5353                                     REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
5354                                 mirror_num = 1;
5355                 }
5356         } else {
5357                 /*
5358                  * after this, stripe_nr is the number of stripes on this
5359                  * device we have to walk to find the data, and stripe_index is
5360                  * the number of our device in the stripe array
5361                  */
5362                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5363                                 &stripe_index);
5364                 mirror_num = stripe_index + 1;
5365         }
5366         BUG_ON(stripe_index >= map->num_stripes);
5367
5368         num_alloc_stripes = num_stripes;
5369         if (dev_replace_is_ongoing) {
5370                 if (rw & (REQ_WRITE | REQ_DISCARD))
5371                         num_alloc_stripes <<= 1;
5372                 if (rw & REQ_GET_READ_MIRRORS)
5373                         num_alloc_stripes++;
5374                 tgtdev_indexes = num_stripes;
5375         }
5376
5377         bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
5378         if (!bbio) {
5379                 ret = -ENOMEM;
5380                 goto out;
5381         }
5382         if (dev_replace_is_ongoing)
5383                 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
5384
5385         /* build raid_map */
5386         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
5387             need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
5388             mirror_num > 1)) {
5389                 u64 tmp;
5390                 unsigned rot;
5391
5392                 bbio->raid_map = (u64 *)((void *)bbio->stripes +
5393                                  sizeof(struct btrfs_bio_stripe) *
5394                                  num_alloc_stripes +
5395                                  sizeof(int) * tgtdev_indexes);
5396
5397                 /* Work out the disk rotation on this stripe-set */
5398                 div_u64_rem(stripe_nr, num_stripes, &rot);
5399
5400                 /* Fill in the logical address of each stripe */
5401                 tmp = stripe_nr * nr_data_stripes(map);
5402                 for (i = 0; i < nr_data_stripes(map); i++)
5403                         bbio->raid_map[(i+rot) % num_stripes] =
5404                                 em->start + (tmp + i) * map->stripe_len;
5405
5406                 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
5407                 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5408                         bbio->raid_map[(i+rot+1) % num_stripes] =
5409                                 RAID6_Q_STRIPE;
5410         }
5411
5412         if (rw & REQ_DISCARD) {
5413                 u32 factor = 0;
5414                 u32 sub_stripes = 0;
5415                 u64 stripes_per_dev = 0;
5416                 u32 remaining_stripes = 0;
5417                 u32 last_stripe = 0;
5418
5419                 if (map->type &
5420                     (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
5421                         if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5422                                 sub_stripes = 1;
5423                         else
5424                                 sub_stripes = map->sub_stripes;
5425
5426                         factor = map->num_stripes / sub_stripes;
5427                         stripes_per_dev = div_u64_rem(stripe_nr_end -
5428                                                       stripe_nr_orig,
5429                                                       factor,
5430                                                       &remaining_stripes);
5431                         div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5432                         last_stripe *= sub_stripes;
5433                 }
5434
5435                 for (i = 0; i < num_stripes; i++) {
5436                         bbio->stripes[i].physical =
5437                                 map->stripes[stripe_index].physical +
5438                                 stripe_offset + stripe_nr * map->stripe_len;
5439                         bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5440
5441                         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5442                                          BTRFS_BLOCK_GROUP_RAID10)) {
5443                                 bbio->stripes[i].length = stripes_per_dev *
5444                                                           map->stripe_len;
5445
5446                                 if (i / sub_stripes < remaining_stripes)
5447                                         bbio->stripes[i].length +=
5448                                                 map->stripe_len;
5449
5450                                 /*
5451                                  * Special for the first stripe and
5452                                  * the last stripe:
5453                                  *
5454                                  * |-------|...|-------|
5455                                  *     |----------|
5456                                  *    off     end_off
5457                                  */
5458                                 if (i < sub_stripes)
5459                                         bbio->stripes[i].length -=
5460                                                 stripe_offset;
5461
5462                                 if (stripe_index >= last_stripe &&
5463                                     stripe_index <= (last_stripe +
5464                                                      sub_stripes - 1))
5465                                         bbio->stripes[i].length -=
5466                                                 stripe_end_offset;
5467
5468                                 if (i == sub_stripes - 1)
5469                                         stripe_offset = 0;
5470                         } else
5471                                 bbio->stripes[i].length = *length;
5472
5473                         stripe_index++;
5474                         if (stripe_index == map->num_stripes) {
5475                                 /* This could only happen for RAID0/10 */
5476                                 stripe_index = 0;
5477                                 stripe_nr++;
5478                         }
5479                 }
5480         } else {
5481                 for (i = 0; i < num_stripes; i++) {
5482                         bbio->stripes[i].physical =
5483                                 map->stripes[stripe_index].physical +
5484                                 stripe_offset +
5485                                 stripe_nr * map->stripe_len;
5486                         bbio->stripes[i].dev =
5487                                 map->stripes[stripe_index].dev;
5488                         stripe_index++;
5489                 }
5490         }
5491
5492         if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5493                 max_errors = btrfs_chunk_max_errors(map);
5494
5495         if (bbio->raid_map)
5496                 sort_parity_stripes(bbio, num_stripes);
5497
5498         tgtdev_indexes = 0;
5499         if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5500             dev_replace->tgtdev != NULL) {
5501                 int index_where_to_add;
5502                 u64 srcdev_devid = dev_replace->srcdev->devid;
5503
5504                 /*
5505                  * duplicate the write operations while the dev replace
5506                  * procedure is running. Since the copying of the old disk
5507                  * to the new disk takes place at run time while the
5508                  * filesystem is mounted writable, the regular write
5509                  * operations to the old disk have to be duplicated to go
5510                  * to the new disk as well.
5511                  * Note that device->missing is handled by the caller, and
5512                  * that the write to the old disk is already set up in the
5513                  * stripes array.
5514                  */
5515                 index_where_to_add = num_stripes;
5516                 for (i = 0; i < num_stripes; i++) {
5517                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
5518                                 /* write to new disk, too */
5519                                 struct btrfs_bio_stripe *new =
5520                                         bbio->stripes + index_where_to_add;
5521                                 struct btrfs_bio_stripe *old =
5522                                         bbio->stripes + i;
5523
5524                                 new->physical = old->physical;
5525                                 new->length = old->length;
5526                                 new->dev = dev_replace->tgtdev;
5527                                 bbio->tgtdev_map[i] = index_where_to_add;
5528                                 index_where_to_add++;
5529                                 max_errors++;
5530                                 tgtdev_indexes++;
5531                         }
5532                 }
5533                 num_stripes = index_where_to_add;
5534         } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
5535                    dev_replace->tgtdev != NULL) {
5536                 u64 srcdev_devid = dev_replace->srcdev->devid;
5537                 int index_srcdev = 0;
5538                 int found = 0;
5539                 u64 physical_of_found = 0;
5540
5541                 /*
5542                  * During the dev-replace procedure, the target drive can
5543                  * also be used to read data in case it is needed to repair
5544                  * a corrupt block elsewhere. This is possible if the
5545                  * requested area is left of the left cursor. In this area,
5546                  * the target drive is a full copy of the source drive.
5547                  */
5548                 for (i = 0; i < num_stripes; i++) {
5549                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
5550                                 /*
5551                                  * In case of DUP, in order to keep it
5552                                  * simple, only add the mirror with the
5553                                  * lowest physical address
5554                                  */
5555                                 if (found &&
5556                                     physical_of_found <=
5557                                      bbio->stripes[i].physical)
5558                                         continue;
5559                                 index_srcdev = i;
5560                                 found = 1;
5561                                 physical_of_found = bbio->stripes[i].physical;
5562                         }
5563                 }
5564                 if (found) {
5565                         if (physical_of_found + map->stripe_len <=
5566                             dev_replace->cursor_left) {
5567                                 struct btrfs_bio_stripe *tgtdev_stripe =
5568                                         bbio->stripes + num_stripes;
5569
5570                                 tgtdev_stripe->physical = physical_of_found;
5571                                 tgtdev_stripe->length =
5572                                         bbio->stripes[index_srcdev].length;
5573                                 tgtdev_stripe->dev = dev_replace->tgtdev;
5574                                 bbio->tgtdev_map[index_srcdev] = num_stripes;
5575
5576                                 tgtdev_indexes++;
5577                                 num_stripes++;
5578                         }
5579                 }
5580         }
5581
5582         *bbio_ret = bbio;
5583         bbio->map_type = map->type;
5584         bbio->num_stripes = num_stripes;
5585         bbio->max_errors = max_errors;
5586         bbio->mirror_num = mirror_num;
5587         bbio->num_tgtdevs = tgtdev_indexes;
5588
5589         /*
5590          * this is the case that REQ_READ && dev_replace_is_ongoing &&
5591          * mirror_num == num_stripes + 1 && dev_replace target drive is
5592          * available as a mirror
5593          */
5594         if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
5595                 WARN_ON(num_stripes > 1);
5596                 bbio->stripes[0].dev = dev_replace->tgtdev;
5597                 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
5598                 bbio->mirror_num = map->num_stripes + 1;
5599         }
5600 out:
5601         if (dev_replace_is_ongoing)
5602                 btrfs_dev_replace_unlock(dev_replace);
5603         free_extent_map(em);
5604         return ret;
5605 }
5606
5607 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5608                       u64 logical, u64 *length,
5609                       struct btrfs_bio **bbio_ret, int mirror_num)
5610 {
5611         return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5612                                  mirror_num, 0);
5613 }
5614
5615 /* For Scrub/replace */
5616 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
5617                      u64 logical, u64 *length,
5618                      struct btrfs_bio **bbio_ret, int mirror_num,
5619                      int need_raid_map)
5620 {
5621         return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5622                                  mirror_num, need_raid_map);
5623 }
5624
5625 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5626                      u64 chunk_start, u64 physical, u64 devid,
5627                      u64 **logical, int *naddrs, int *stripe_len)
5628 {
5629         struct extent_map_tree *em_tree = &map_tree->map_tree;
5630         struct extent_map *em;
5631         struct map_lookup *map;
5632         u64 *buf;
5633         u64 bytenr;
5634         u64 length;
5635         u64 stripe_nr;
5636         u64 rmap_len;
5637         int i, j, nr = 0;
5638
5639         read_lock(&em_tree->lock);
5640         em = lookup_extent_mapping(em_tree, chunk_start, 1);
5641         read_unlock(&em_tree->lock);
5642
5643         if (!em) {
5644                 printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n",
5645                        chunk_start);
5646                 return -EIO;
5647         }
5648
5649         if (em->start != chunk_start) {
5650                 printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n",
5651                        em->start, chunk_start);
5652                 free_extent_map(em);
5653                 return -EIO;
5654         }
5655         map = (struct map_lookup *)em->bdev;
5656
5657         length = em->len;
5658         rmap_len = map->stripe_len;
5659
5660         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5661                 length = div_u64(length, map->num_stripes / map->sub_stripes);
5662         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5663                 length = div_u64(length, map->num_stripes);
5664         else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5665                 length = div_u64(length, nr_data_stripes(map));
5666                 rmap_len = map->stripe_len * nr_data_stripes(map);
5667         }
5668
5669         buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
5670         BUG_ON(!buf); /* -ENOMEM */
5671
5672         for (i = 0; i < map->num_stripes; i++) {
5673                 if (devid && map->stripes[i].dev->devid != devid)
5674                         continue;
5675                 if (map->stripes[i].physical > physical ||
5676                     map->stripes[i].physical + length <= physical)
5677                         continue;
5678
5679                 stripe_nr = physical - map->stripes[i].physical;
5680                 stripe_nr = div_u64(stripe_nr, map->stripe_len);
5681
5682                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5683                         stripe_nr = stripe_nr * map->num_stripes + i;
5684                         stripe_nr = div_u64(stripe_nr, map->sub_stripes);
5685                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5686                         stripe_nr = stripe_nr * map->num_stripes + i;
5687                 } /* else if RAID[56], multiply by nr_data_stripes().
5688                    * Alternatively, just use rmap_len below instead of
5689                    * map->stripe_len */
5690
5691                 bytenr = chunk_start + stripe_nr * rmap_len;
5692                 WARN_ON(nr >= map->num_stripes);
5693                 for (j = 0; j < nr; j++) {
5694                         if (buf[j] == bytenr)
5695                                 break;
5696                 }
5697                 if (j == nr) {
5698                         WARN_ON(nr >= map->num_stripes);
5699                         buf[nr++] = bytenr;
5700                 }
5701         }
5702
5703         *logical = buf;
5704         *naddrs = nr;
5705         *stripe_len = rmap_len;
5706
5707         free_extent_map(em);
5708         return 0;
5709 }
5710
5711 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
5712 {
5713         bio->bi_private = bbio->private;
5714         bio->bi_end_io = bbio->end_io;
5715         bio_endio(bio);
5716
5717         btrfs_put_bbio(bbio);
5718 }
5719
5720 static void btrfs_end_bio(struct bio *bio)
5721 {
5722         struct btrfs_bio *bbio = bio->bi_private;
5723         int is_orig_bio = 0;
5724
5725         if (bio->bi_error) {
5726                 atomic_inc(&bbio->error);
5727                 if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) {
5728                         unsigned int stripe_index =
5729                                 btrfs_io_bio(bio)->stripe_index;
5730                         struct btrfs_device *dev;
5731
5732                         BUG_ON(stripe_index >= bbio->num_stripes);
5733                         dev = bbio->stripes[stripe_index].dev;
5734                         if (dev->bdev) {
5735                                 if (bio->bi_rw & WRITE)
5736                                         btrfs_dev_stat_inc(dev,
5737                                                 BTRFS_DEV_STAT_WRITE_ERRS);
5738                                 else
5739                                         btrfs_dev_stat_inc(dev,
5740                                                 BTRFS_DEV_STAT_READ_ERRS);
5741                                 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
5742                                         btrfs_dev_stat_inc(dev,
5743                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
5744                                 btrfs_dev_stat_print_on_error(dev);
5745                         }
5746                 }
5747         }
5748
5749         if (bio == bbio->orig_bio)
5750                 is_orig_bio = 1;
5751
5752         btrfs_bio_counter_dec(bbio->fs_info);
5753
5754         if (atomic_dec_and_test(&bbio->stripes_pending)) {
5755                 if (!is_orig_bio) {
5756                         bio_put(bio);
5757                         bio = bbio->orig_bio;
5758                 }
5759
5760                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5761                 /* only send an error to the higher layers if it is
5762                  * beyond the tolerance of the btrfs bio
5763                  */
5764                 if (atomic_read(&bbio->error) > bbio->max_errors) {
5765                         bio->bi_error = -EIO;
5766                 } else {
5767                         /*
5768                          * this bio is actually up to date, we didn't
5769                          * go over the max number of errors
5770                          */
5771                         bio->bi_error = 0;
5772                 }
5773
5774                 btrfs_end_bbio(bbio, bio);
5775         } else if (!is_orig_bio) {
5776                 bio_put(bio);
5777         }
5778 }
5779
5780 /*
5781  * see run_scheduled_bios for a description of why bios are collected for
5782  * async submit.
5783  *
5784  * This will add one bio to the pending list for a device and make sure
5785  * the work struct is scheduled.
5786  */
5787 static noinline void btrfs_schedule_bio(struct btrfs_root *root,
5788                                         struct btrfs_device *device,
5789                                         int rw, struct bio *bio)
5790 {
5791         int should_queue = 1;
5792         struct btrfs_pending_bios *pending_bios;
5793
5794         if (device->missing || !device->bdev) {
5795                 bio_io_error(bio);
5796                 return;
5797         }
5798
5799         /* don't bother with additional async steps for reads, right now */
5800         if (!(rw & REQ_WRITE)) {
5801                 bio_get(bio);
5802                 btrfsic_submit_bio(rw, bio);
5803                 bio_put(bio);
5804                 return;
5805         }
5806
5807         /*
5808          * nr_async_bios allows us to reliably return congestion to the
5809          * higher layers.  Otherwise, the async bio makes it appear we have
5810          * made progress against dirty pages when we've really just put it
5811          * on a queue for later
5812          */
5813         atomic_inc(&root->fs_info->nr_async_bios);
5814         WARN_ON(bio->bi_next);
5815         bio->bi_next = NULL;
5816         bio->bi_rw |= rw;
5817
5818         spin_lock(&device->io_lock);
5819         if (bio->bi_rw & REQ_SYNC)
5820                 pending_bios = &device->pending_sync_bios;
5821         else
5822                 pending_bios = &device->pending_bios;
5823
5824         if (pending_bios->tail)
5825                 pending_bios->tail->bi_next = bio;
5826
5827         pending_bios->tail = bio;
5828         if (!pending_bios->head)
5829                 pending_bios->head = bio;
5830         if (device->running_pending)
5831                 should_queue = 0;
5832
5833         spin_unlock(&device->io_lock);
5834
5835         if (should_queue)
5836                 btrfs_queue_work(root->fs_info->submit_workers,
5837                                  &device->work);
5838 }
5839
5840 static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5841                               struct bio *bio, u64 physical, int dev_nr,
5842                               int rw, int async)
5843 {
5844         struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
5845
5846         bio->bi_private = bbio;
5847         btrfs_io_bio(bio)->stripe_index = dev_nr;
5848         bio->bi_end_io = btrfs_end_bio;
5849         bio->bi_iter.bi_sector = physical >> 9;
5850 #ifdef DEBUG
5851         {
5852                 struct rcu_string *name;
5853
5854                 rcu_read_lock();
5855                 name = rcu_dereference(dev->name);
5856                 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
5857                          "(%s id %llu), size=%u\n", rw,
5858                          (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
5859                          name->str, dev->devid, bio->bi_iter.bi_size);
5860                 rcu_read_unlock();
5861         }
5862 #endif
5863         bio->bi_bdev = dev->bdev;
5864
5865         btrfs_bio_counter_inc_noblocked(root->fs_info);
5866
5867         if (async)
5868                 btrfs_schedule_bio(root, dev, rw, bio);
5869         else
5870                 btrfsic_submit_bio(rw, bio);
5871 }
5872
5873 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
5874 {
5875         atomic_inc(&bbio->error);
5876         if (atomic_dec_and_test(&bbio->stripes_pending)) {
5877                 /* Shoud be the original bio. */
5878                 WARN_ON(bio != bbio->orig_bio);
5879
5880                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5881                 bio->bi_iter.bi_sector = logical >> 9;
5882                 bio->bi_error = -EIO;
5883                 btrfs_end_bbio(bbio, bio);
5884         }
5885 }
5886
5887 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5888                   int mirror_num, int async_submit)
5889 {
5890         struct btrfs_device *dev;
5891         struct bio *first_bio = bio;
5892         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
5893         u64 length = 0;
5894         u64 map_length;
5895         int ret;
5896         int dev_nr;
5897         int total_devs;
5898         struct btrfs_bio *bbio = NULL;
5899
5900         length = bio->bi_iter.bi_size;
5901         map_length = length;
5902
5903         btrfs_bio_counter_inc_blocked(root->fs_info);
5904         ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
5905                               mirror_num, 1);
5906         if (ret) {
5907                 btrfs_bio_counter_dec(root->fs_info);
5908                 return ret;
5909         }
5910
5911         total_devs = bbio->num_stripes;
5912         bbio->orig_bio = first_bio;
5913         bbio->private = first_bio->bi_private;
5914         bbio->end_io = first_bio->bi_end_io;
5915         bbio->fs_info = root->fs_info;
5916         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5917
5918         if (bbio->raid_map) {
5919                 /* In this case, map_length has been set to the length of
5920                    a single stripe; not the whole write */
5921                 if (rw & WRITE) {
5922                         ret = raid56_parity_write(root, bio, bbio, map_length);
5923                 } else {
5924                         ret = raid56_parity_recover(root, bio, bbio, map_length,
5925                                                     mirror_num, 1);
5926                 }
5927
5928                 btrfs_bio_counter_dec(root->fs_info);
5929                 return ret;
5930         }
5931
5932         if (map_length < length) {
5933                 btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
5934                         logical, length, map_length);
5935                 BUG();
5936         }
5937
5938         for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
5939                 dev = bbio->stripes[dev_nr].dev;
5940                 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
5941                         bbio_error(bbio, first_bio, logical);
5942                         continue;
5943                 }
5944
5945                 if (dev_nr < total_devs - 1) {
5946                         bio = btrfs_bio_clone(first_bio, GFP_NOFS);
5947                         BUG_ON(!bio); /* -ENOMEM */
5948                 } else
5949                         bio = first_bio;
5950
5951                 submit_stripe_bio(root, bbio, bio,
5952                                   bbio->stripes[dev_nr].physical, dev_nr, rw,
5953                                   async_submit);
5954         }
5955         btrfs_bio_counter_dec(root->fs_info);
5956         return 0;
5957 }
5958
5959 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
5960                                        u8 *uuid, u8 *fsid)
5961 {
5962         struct btrfs_device *device;
5963         struct btrfs_fs_devices *cur_devices;
5964
5965         cur_devices = fs_info->fs_devices;
5966         while (cur_devices) {
5967                 if (!fsid ||
5968                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
5969                         device = __find_device(&cur_devices->devices,
5970                                                devid, uuid);
5971                         if (device)
5972                                 return device;
5973                 }
5974                 cur_devices = cur_devices->seed;
5975         }
5976         return NULL;
5977 }
5978
5979 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5980                                             struct btrfs_fs_devices *fs_devices,
5981                                             u64 devid, u8 *dev_uuid)
5982 {
5983         struct btrfs_device *device;
5984
5985         device = btrfs_alloc_device(NULL, &devid, dev_uuid);
5986         if (IS_ERR(device))
5987                 return NULL;
5988
5989         list_add(&device->dev_list, &fs_devices->devices);
5990         device->fs_devices = fs_devices;
5991         fs_devices->num_devices++;
5992
5993         device->missing = 1;
5994         fs_devices->missing_devices++;
5995
5996         return device;
5997 }
5998
5999 /**
6000  * btrfs_alloc_device - allocate struct btrfs_device
6001  * @fs_info:    used only for generating a new devid, can be NULL if
6002  *              devid is provided (i.e. @devid != NULL).
6003  * @devid:      a pointer to devid for this device.  If NULL a new devid
6004  *              is generated.
6005  * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6006  *              is generated.
6007  *
6008  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6009  * on error.  Returned struct is not linked onto any lists and can be
6010  * destroyed with kfree() right away.
6011  */
6012 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6013                                         const u64 *devid,
6014                                         const u8 *uuid)
6015 {
6016         struct btrfs_device *dev;
6017         u64 tmp;
6018
6019         if (WARN_ON(!devid && !fs_info))
6020                 return ERR_PTR(-EINVAL);
6021
6022         dev = __alloc_device();
6023         if (IS_ERR(dev))
6024                 return dev;
6025
6026         if (devid)
6027                 tmp = *devid;
6028         else {
6029                 int ret;
6030
6031                 ret = find_next_devid(fs_info, &tmp);
6032                 if (ret) {
6033                         kfree(dev);
6034                         return ERR_PTR(ret);
6035                 }
6036         }
6037         dev->devid = tmp;
6038
6039         if (uuid)
6040                 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6041         else
6042                 generate_random_uuid(dev->uuid);
6043
6044         btrfs_init_work(&dev->work, btrfs_submit_helper,
6045                         pending_bios_fn, NULL, NULL);
6046
6047         return dev;
6048 }
6049
6050 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
6051                           struct extent_buffer *leaf,
6052                           struct btrfs_chunk *chunk)
6053 {
6054         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
6055         struct map_lookup *map;
6056         struct extent_map *em;
6057         u64 logical;
6058         u64 length;
6059         u64 devid;
6060         u8 uuid[BTRFS_UUID_SIZE];
6061         int num_stripes;
6062         int ret;
6063         int i;
6064
6065         logical = key->offset;
6066         length = btrfs_chunk_length(leaf, chunk);
6067
6068         read_lock(&map_tree->map_tree.lock);
6069         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
6070         read_unlock(&map_tree->map_tree.lock);
6071
6072         /* already mapped? */
6073         if (em && em->start <= logical && em->start + em->len > logical) {
6074                 free_extent_map(em);
6075                 return 0;
6076         } else if (em) {
6077                 free_extent_map(em);
6078         }
6079
6080         em = alloc_extent_map();
6081         if (!em)
6082                 return -ENOMEM;
6083         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6084         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6085         if (!map) {
6086                 free_extent_map(em);
6087                 return -ENOMEM;
6088         }
6089
6090         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6091         em->bdev = (struct block_device *)map;
6092         em->start = logical;
6093         em->len = length;
6094         em->orig_start = 0;
6095         em->block_start = 0;
6096         em->block_len = em->len;
6097
6098         map->num_stripes = num_stripes;
6099         map->io_width = btrfs_chunk_io_width(leaf, chunk);
6100         map->io_align = btrfs_chunk_io_align(leaf, chunk);
6101         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
6102         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6103         map->type = btrfs_chunk_type(leaf, chunk);
6104         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6105         for (i = 0; i < num_stripes; i++) {
6106                 map->stripes[i].physical =
6107                         btrfs_stripe_offset_nr(leaf, chunk, i);
6108                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6109                 read_extent_buffer(leaf, uuid, (unsigned long)
6110                                    btrfs_stripe_dev_uuid_nr(chunk, i),
6111                                    BTRFS_UUID_SIZE);
6112                 map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
6113                                                         uuid, NULL);
6114                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
6115                         free_extent_map(em);
6116                         return -EIO;
6117                 }
6118                 if (!map->stripes[i].dev) {
6119                         map->stripes[i].dev =
6120                                 add_missing_dev(root, root->fs_info->fs_devices,
6121                                                 devid, uuid);
6122                         if (!map->stripes[i].dev) {
6123                                 free_extent_map(em);
6124                                 return -EIO;
6125                         }
6126                         btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing",
6127                                                 devid, uuid);
6128                 }
6129                 map->stripes[i].dev->in_fs_metadata = 1;
6130         }
6131
6132         write_lock(&map_tree->map_tree.lock);
6133         ret = add_extent_mapping(&map_tree->map_tree, em, 0);
6134         write_unlock(&map_tree->map_tree.lock);
6135         BUG_ON(ret); /* Tree corruption */
6136         free_extent_map(em);
6137
6138         return 0;
6139 }
6140
6141 static void fill_device_from_item(struct extent_buffer *leaf,
6142                                  struct btrfs_dev_item *dev_item,
6143                                  struct btrfs_device *device)
6144 {
6145         unsigned long ptr;
6146
6147         device->devid = btrfs_device_id(leaf, dev_item);
6148         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6149         device->total_bytes = device->disk_total_bytes;
6150         device->commit_total_bytes = device->disk_total_bytes;
6151         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6152         device->commit_bytes_used = device->bytes_used;
6153         device->type = btrfs_device_type(leaf, dev_item);
6154         device->io_align = btrfs_device_io_align(leaf, dev_item);
6155         device->io_width = btrfs_device_io_width(leaf, dev_item);
6156         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6157         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6158         device->is_tgtdev_for_dev_replace = 0;
6159
6160         ptr = btrfs_device_uuid(dev_item);
6161         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6162 }
6163
6164 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
6165                                                   u8 *fsid)
6166 {
6167         struct btrfs_fs_devices *fs_devices;
6168         int ret;
6169
6170         BUG_ON(!mutex_is_locked(&uuid_mutex));
6171
6172         fs_devices = root->fs_info->fs_devices->seed;
6173         while (fs_devices) {
6174                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
6175                         return fs_devices;
6176
6177                 fs_devices = fs_devices->seed;
6178         }
6179
6180         fs_devices = find_fsid(fsid);
6181         if (!fs_devices) {
6182                 if (!btrfs_test_opt(root, DEGRADED))
6183                         return ERR_PTR(-ENOENT);
6184
6185                 fs_devices = alloc_fs_devices(fsid);
6186                 if (IS_ERR(fs_devices))
6187                         return fs_devices;
6188
6189                 fs_devices->seeding = 1;
6190                 fs_devices->opened = 1;
6191                 return fs_devices;
6192         }
6193
6194         fs_devices = clone_fs_devices(fs_devices);
6195         if (IS_ERR(fs_devices))
6196                 return fs_devices;
6197
6198         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
6199                                    root->fs_info->bdev_holder);
6200         if (ret) {
6201                 free_fs_devices(fs_devices);
6202                 fs_devices = ERR_PTR(ret);
6203                 goto out;
6204         }
6205
6206         if (!fs_devices->seeding) {
6207                 __btrfs_close_devices(fs_devices);
6208                 free_fs_devices(fs_devices);
6209                 fs_devices = ERR_PTR(-EINVAL);
6210                 goto out;
6211         }
6212
6213         fs_devices->seed = root->fs_info->fs_devices->seed;
6214         root->fs_info->fs_devices->seed = fs_devices;
6215 out:
6216         return fs_devices;
6217 }
6218
6219 static int read_one_dev(struct btrfs_root *root,
6220                         struct extent_buffer *leaf,
6221                         struct btrfs_dev_item *dev_item)
6222 {
6223         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6224         struct btrfs_device *device;
6225         u64 devid;
6226         int ret;
6227         u8 fs_uuid[BTRFS_UUID_SIZE];
6228         u8 dev_uuid[BTRFS_UUID_SIZE];
6229
6230         devid = btrfs_device_id(leaf, dev_item);
6231         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6232                            BTRFS_UUID_SIZE);
6233         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6234                            BTRFS_UUID_SIZE);
6235
6236         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
6237                 fs_devices = open_seed_devices(root, fs_uuid);
6238                 if (IS_ERR(fs_devices))
6239                         return PTR_ERR(fs_devices);
6240         }
6241
6242         device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
6243         if (!device) {
6244                 if (!btrfs_test_opt(root, DEGRADED))
6245                         return -EIO;
6246
6247                 device = add_missing_dev(root, fs_devices, devid, dev_uuid);
6248                 if (!device)
6249                         return -ENOMEM;
6250                 btrfs_warn(root->fs_info, "devid %llu uuid %pU missing",
6251                                 devid, dev_uuid);
6252         } else {
6253                 if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
6254                         return -EIO;
6255
6256                 if(!device->bdev && !device->missing) {
6257                         /*
6258                          * this happens when a device that was properly setup
6259                          * in the device info lists suddenly goes bad.
6260                          * device->bdev is NULL, and so we have to set
6261                          * device->missing to one here
6262                          */
6263                         device->fs_devices->missing_devices++;
6264                         device->missing = 1;
6265                 }
6266
6267                 /* Move the device to its own fs_devices */
6268                 if (device->fs_devices != fs_devices) {
6269                         ASSERT(device->missing);
6270
6271                         list_move(&device->dev_list, &fs_devices->devices);
6272                         device->fs_devices->num_devices--;
6273                         fs_devices->num_devices++;
6274
6275                         device->fs_devices->missing_devices--;
6276                         fs_devices->missing_devices++;
6277
6278                         device->fs_devices = fs_devices;
6279                 }
6280         }
6281
6282         if (device->fs_devices != root->fs_info->fs_devices) {
6283                 BUG_ON(device->writeable);
6284                 if (device->generation !=
6285                     btrfs_device_generation(leaf, dev_item))
6286                         return -EINVAL;
6287         }
6288
6289         fill_device_from_item(leaf, dev_item, device);
6290         device->in_fs_metadata = 1;
6291         if (device->writeable && !device->is_tgtdev_for_dev_replace) {
6292                 device->fs_devices->total_rw_bytes += device->total_bytes;
6293                 spin_lock(&root->fs_info->free_chunk_lock);
6294                 root->fs_info->free_chunk_space += device->total_bytes -
6295                         device->bytes_used;
6296                 spin_unlock(&root->fs_info->free_chunk_lock);
6297         }
6298         ret = 0;
6299         return ret;
6300 }
6301
6302 int btrfs_read_sys_array(struct btrfs_root *root)
6303 {
6304         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
6305         struct extent_buffer *sb;
6306         struct btrfs_disk_key *disk_key;
6307         struct btrfs_chunk *chunk;
6308         u8 *array_ptr;
6309         unsigned long sb_array_offset;
6310         int ret = 0;
6311         u32 num_stripes;
6312         u32 array_size;
6313         u32 len = 0;
6314         u32 cur_offset;
6315         struct btrfs_key key;
6316
6317         ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
6318         /*
6319          * This will create extent buffer of nodesize, superblock size is
6320          * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6321          * overallocate but we can keep it as-is, only the first page is used.
6322          */
6323         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
6324         if (!sb)
6325                 return -ENOMEM;
6326         btrfs_set_buffer_uptodate(sb);
6327         btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6328         /*
6329          * The sb extent buffer is artifical and just used to read the system array.
6330          * btrfs_set_buffer_uptodate() call does not properly mark all it's
6331          * pages up-to-date when the page is larger: extent does not cover the
6332          * whole page and consequently check_page_uptodate does not find all
6333          * the page's extents up-to-date (the hole beyond sb),
6334          * write_extent_buffer then triggers a WARN_ON.
6335          *
6336          * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
6337          * but sb spans only this function. Add an explicit SetPageUptodate call
6338          * to silence the warning eg. on PowerPC 64.
6339          */
6340         if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
6341                 SetPageUptodate(sb->pages[0]);
6342
6343         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6344         array_size = btrfs_super_sys_array_size(super_copy);
6345
6346         array_ptr = super_copy->sys_chunk_array;
6347         sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
6348         cur_offset = 0;
6349
6350         while (cur_offset < array_size) {
6351                 disk_key = (struct btrfs_disk_key *)array_ptr;
6352                 len = sizeof(*disk_key);
6353                 if (cur_offset + len > array_size)
6354                         goto out_short_read;
6355
6356                 btrfs_disk_key_to_cpu(&key, disk_key);
6357
6358                 array_ptr += len;
6359                 sb_array_offset += len;
6360                 cur_offset += len;
6361
6362                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6363                         chunk = (struct btrfs_chunk *)sb_array_offset;
6364                         /*
6365                          * At least one btrfs_chunk with one stripe must be
6366                          * present, exact stripe count check comes afterwards
6367                          */
6368                         len = btrfs_chunk_item_size(1);
6369                         if (cur_offset + len > array_size)
6370                                 goto out_short_read;
6371
6372                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6373                         len = btrfs_chunk_item_size(num_stripes);
6374                         if (cur_offset + len > array_size)
6375                                 goto out_short_read;
6376
6377                         ret = read_one_chunk(root, &key, sb, chunk);
6378                         if (ret)
6379                                 break;
6380                 } else {
6381                         ret = -EIO;
6382                         break;
6383                 }
6384                 array_ptr += len;
6385                 sb_array_offset += len;
6386                 cur_offset += len;
6387         }
6388         free_extent_buffer(sb);
6389         return ret;
6390
6391 out_short_read:
6392         printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
6393                         len, cur_offset);
6394         free_extent_buffer(sb);
6395         return -EIO;
6396 }
6397
6398 int btrfs_read_chunk_tree(struct btrfs_root *root)
6399 {
6400         struct btrfs_path *path;
6401         struct extent_buffer *leaf;
6402         struct btrfs_key key;
6403         struct btrfs_key found_key;
6404         int ret;
6405         int slot;
6406
6407         root = root->fs_info->chunk_root;
6408
6409         path = btrfs_alloc_path();
6410         if (!path)
6411                 return -ENOMEM;
6412
6413         mutex_lock(&uuid_mutex);
6414         lock_chunks(root);
6415
6416         /*
6417          * Read all device items, and then all the chunk items. All
6418          * device items are found before any chunk item (their object id
6419          * is smaller than the lowest possible object id for a chunk
6420          * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
6421          */
6422         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
6423         key.offset = 0;
6424         key.type = 0;
6425         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6426         if (ret < 0)
6427                 goto error;
6428         while (1) {
6429                 leaf = path->nodes[0];
6430                 slot = path->slots[0];
6431                 if (slot >= btrfs_header_nritems(leaf)) {
6432                         ret = btrfs_next_leaf(root, path);
6433                         if (ret == 0)
6434                                 continue;
6435                         if (ret < 0)
6436                                 goto error;
6437                         break;
6438                 }
6439                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6440                 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
6441                         struct btrfs_dev_item *dev_item;
6442                         dev_item = btrfs_item_ptr(leaf, slot,
6443                                                   struct btrfs_dev_item);
6444                         ret = read_one_dev(root, leaf, dev_item);
6445                         if (ret)
6446                                 goto error;
6447                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
6448                         struct btrfs_chunk *chunk;
6449                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
6450                         ret = read_one_chunk(root, &found_key, leaf, chunk);
6451                         if (ret)
6452                                 goto error;
6453                 }
6454                 path->slots[0]++;
6455         }
6456         ret = 0;
6457 error:
6458         unlock_chunks(root);
6459         mutex_unlock(&uuid_mutex);
6460
6461         btrfs_free_path(path);
6462         return ret;
6463 }
6464
6465 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
6466 {
6467         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6468         struct btrfs_device *device;
6469
6470         while (fs_devices) {
6471                 mutex_lock(&fs_devices->device_list_mutex);
6472                 list_for_each_entry(device, &fs_devices->devices, dev_list)
6473                         device->dev_root = fs_info->dev_root;
6474                 mutex_unlock(&fs_devices->device_list_mutex);
6475
6476                 fs_devices = fs_devices->seed;
6477         }
6478 }
6479
6480 static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
6481 {
6482         int i;
6483
6484         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6485                 btrfs_dev_stat_reset(dev, i);
6486 }
6487
6488 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
6489 {
6490         struct btrfs_key key;
6491         struct btrfs_key found_key;
6492         struct btrfs_root *dev_root = fs_info->dev_root;
6493         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6494         struct extent_buffer *eb;
6495         int slot;
6496         int ret = 0;
6497         struct btrfs_device *device;
6498         struct btrfs_path *path = NULL;
6499         int i;
6500
6501         path = btrfs_alloc_path();
6502         if (!path) {
6503                 ret = -ENOMEM;
6504                 goto out;
6505         }
6506
6507         mutex_lock(&fs_devices->device_list_mutex);
6508         list_for_each_entry(device, &fs_devices->devices, dev_list) {
6509                 int item_size;
6510                 struct btrfs_dev_stats_item *ptr;
6511
6512                 key.objectid = 0;
6513                 key.type = BTRFS_DEV_STATS_KEY;
6514                 key.offset = device->devid;
6515                 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
6516                 if (ret) {
6517                         __btrfs_reset_dev_stats(device);
6518                         device->dev_stats_valid = 1;
6519                         btrfs_release_path(path);
6520                         continue;
6521                 }
6522                 slot = path->slots[0];
6523                 eb = path->nodes[0];
6524                 btrfs_item_key_to_cpu(eb, &found_key, slot);
6525                 item_size = btrfs_item_size_nr(eb, slot);
6526
6527                 ptr = btrfs_item_ptr(eb, slot,
6528                                      struct btrfs_dev_stats_item);
6529
6530                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
6531                         if (item_size >= (1 + i) * sizeof(__le64))
6532                                 btrfs_dev_stat_set(device, i,
6533                                         btrfs_dev_stats_value(eb, ptr, i));
6534                         else
6535                                 btrfs_dev_stat_reset(device, i);
6536                 }
6537
6538                 device->dev_stats_valid = 1;
6539                 btrfs_dev_stat_print_on_load(device);
6540                 btrfs_release_path(path);
6541         }
6542         mutex_unlock(&fs_devices->device_list_mutex);
6543
6544 out:
6545         btrfs_free_path(path);
6546         return ret < 0 ? ret : 0;
6547 }
6548
6549 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6550                                 struct btrfs_root *dev_root,
6551                                 struct btrfs_device *device)
6552 {
6553         struct btrfs_path *path;
6554         struct btrfs_key key;
6555         struct extent_buffer *eb;
6556         struct btrfs_dev_stats_item *ptr;
6557         int ret;
6558         int i;
6559
6560         key.objectid = 0;
6561         key.type = BTRFS_DEV_STATS_KEY;
6562         key.offset = device->devid;
6563
6564         path = btrfs_alloc_path();
6565         BUG_ON(!path);
6566         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
6567         if (ret < 0) {
6568                 btrfs_warn_in_rcu(dev_root->fs_info,
6569                         "error %d while searching for dev_stats item for device %s",
6570                               ret, rcu_str_deref(device->name));
6571                 goto out;
6572         }
6573
6574         if (ret == 0 &&
6575             btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
6576                 /* need to delete old one and insert a new one */
6577                 ret = btrfs_del_item(trans, dev_root, path);
6578                 if (ret != 0) {
6579                         btrfs_warn_in_rcu(dev_root->fs_info,
6580                                 "delete too small dev_stats item for device %s failed %d",
6581                                       rcu_str_deref(device->name), ret);
6582                         goto out;
6583                 }
6584                 ret = 1;
6585         }
6586
6587         if (ret == 1) {
6588                 /* need to insert a new item */
6589                 btrfs_release_path(path);
6590                 ret = btrfs_insert_empty_item(trans, dev_root, path,
6591                                               &key, sizeof(*ptr));
6592                 if (ret < 0) {
6593                         btrfs_warn_in_rcu(dev_root->fs_info,
6594                                 "insert dev_stats item for device %s failed %d",
6595                                 rcu_str_deref(device->name), ret);
6596                         goto out;
6597                 }
6598         }
6599
6600         eb = path->nodes[0];
6601         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
6602         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6603                 btrfs_set_dev_stats_value(eb, ptr, i,
6604                                           btrfs_dev_stat_read(device, i));
6605         btrfs_mark_buffer_dirty(eb);
6606
6607 out:
6608         btrfs_free_path(path);
6609         return ret;
6610 }
6611
6612 /*
6613  * called from commit_transaction. Writes all changed device stats to disk.
6614  */
6615 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
6616                         struct btrfs_fs_info *fs_info)
6617 {
6618         struct btrfs_root *dev_root = fs_info->dev_root;
6619         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6620         struct btrfs_device *device;
6621         int stats_cnt;
6622         int ret = 0;
6623
6624         mutex_lock(&fs_devices->device_list_mutex);
6625         list_for_each_entry(device, &fs_devices->devices, dev_list) {
6626                 if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
6627                         continue;
6628
6629                 stats_cnt = atomic_read(&device->dev_stats_ccnt);
6630                 ret = update_dev_stat_item(trans, dev_root, device);
6631                 if (!ret)
6632                         atomic_sub(stats_cnt, &device->dev_stats_ccnt);
6633         }
6634         mutex_unlock(&fs_devices->device_list_mutex);
6635
6636         return ret;
6637 }
6638
/*
 * Increment statistics counter @index of @dev, then report the device's
 * counters through btrfs_dev_stat_print_on_error().
 */
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	/* Count first so the report already includes this event */
	btrfs_dev_stat_inc(dev, index);

	btrfs_dev_stat_print_on_error(dev);
}
6644
6645 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
6646 {
6647         if (!dev->dev_stats_valid)
6648                 return;
6649         btrfs_err_rl_in_rcu(dev->dev_root->fs_info,
6650                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
6651                            rcu_str_deref(dev->name),
6652                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6653                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
6654                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
6655                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
6656                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
6657 }
6658
6659 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
6660 {
6661         int i;
6662
6663         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6664                 if (btrfs_dev_stat_read(dev, i) != 0)
6665                         break;
6666         if (i == BTRFS_DEV_STAT_VALUES_MAX)
6667                 return; /* all values == 0, suppress message */
6668
6669         btrfs_info_in_rcu(dev->dev_root->fs_info,
6670                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
6671                rcu_str_deref(dev->name),
6672                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6673                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
6674                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
6675                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
6676                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
6677 }
6678
6679 int btrfs_get_dev_stats(struct btrfs_root *root,
6680                         struct btrfs_ioctl_get_dev_stats *stats)
6681 {
6682         struct btrfs_device *dev;
6683         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6684         int i;
6685
6686         mutex_lock(&fs_devices->device_list_mutex);
6687         dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
6688         mutex_unlock(&fs_devices->device_list_mutex);
6689
6690         if (!dev) {
6691                 btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
6692                 return -ENODEV;
6693         } else if (!dev->dev_stats_valid) {
6694                 btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
6695                 return -ENODEV;
6696         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
6697                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
6698                         if (stats->nr_items > i)
6699                                 stats->values[i] =
6700                                         btrfs_dev_stat_read_and_reset(dev, i);
6701                         else
6702                                 btrfs_dev_stat_reset(dev, i);
6703                 }
6704         } else {
6705                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6706                         if (stats->nr_items > i)
6707                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
6708         }
6709         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
6710                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
6711         return 0;
6712 }
6713
6714 void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path)
6715 {
6716         struct buffer_head *bh;
6717         struct btrfs_super_block *disk_super;
6718         int copy_num;
6719
6720         if (!bdev)
6721                 return;
6722
6723         for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
6724                 copy_num++) {
6725
6726                 if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
6727                         continue;
6728
6729                 disk_super = (struct btrfs_super_block *)bh->b_data;
6730
6731                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
6732                 set_buffer_dirty(bh);
6733                 sync_dirty_buffer(bh);
6734                 brelse(bh);
6735         }
6736
6737         /* Notify udev that device has changed */
6738         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
6739
6740         /* Update ctime/mtime for device path for libblkid */
6741         update_dev_time(device_path);
6742 }
6743
6744 /*
6745  * Update the size of all devices, which is used for writing out the
6746  * super blocks.
6747  */
6748 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
6749 {
6750         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6751         struct btrfs_device *curr, *next;
6752
6753         if (list_empty(&fs_devices->resized_devices))
6754                 return;
6755
6756         mutex_lock(&fs_devices->device_list_mutex);
6757         lock_chunks(fs_info->dev_root);
6758         list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
6759                                  resized_list) {
6760                 list_del_init(&curr->resized_list);
6761                 curr->commit_total_bytes = curr->disk_total_bytes;
6762         }
6763         unlock_chunks(fs_info->dev_root);
6764         mutex_unlock(&fs_devices->device_list_mutex);
6765 }
6766
6767 /* Must be invoked during the transaction commit */
6768 void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
6769                                         struct btrfs_transaction *transaction)
6770 {
6771         struct extent_map *em;
6772         struct map_lookup *map;
6773         struct btrfs_device *dev;
6774         int i;
6775
6776         if (list_empty(&transaction->pending_chunks))
6777                 return;
6778
6779         /* In order to kick the device replace finish process */
6780         lock_chunks(root);
6781         list_for_each_entry(em, &transaction->pending_chunks, list) {
6782                 map = (struct map_lookup *)em->bdev;
6783
6784                 for (i = 0; i < map->num_stripes; i++) {
6785                         dev = map->stripes[i].dev;
6786                         dev->commit_bytes_used = dev->bytes_used;
6787                 }
6788         }
6789         unlock_chunks(root);
6790 }
6791
6792 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
6793 {
6794         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6795         while (fs_devices) {
6796                 fs_devices->fs_info = fs_info;
6797                 fs_devices = fs_devices->seed;
6798         }
6799 }
6800
6801 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
6802 {
6803         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6804         while (fs_devices) {
6805                 fs_devices->fs_info = NULL;
6806                 fs_devices = fs_devices->seed;
6807         }
6808 }
6809
6810 void btrfs_close_one_device(struct btrfs_device *device)
6811 {
6812         struct btrfs_fs_devices *fs_devices = device->fs_devices;
6813         struct btrfs_device *new_device;
6814         struct rcu_string *name;
6815
6816         if (device->bdev)
6817                 fs_devices->open_devices--;
6818
6819         if (device->writeable &&
6820             device->devid != BTRFS_DEV_REPLACE_DEVID) {
6821                 list_del_init(&device->dev_alloc_list);
6822                 fs_devices->rw_devices--;
6823         }
6824
6825         if (device->missing)
6826                 fs_devices->missing_devices--;
6827
6828         new_device = btrfs_alloc_device(NULL, &device->devid,
6829                                         device->uuid);
6830         BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
6831
6832         /* Safe because we are under uuid_mutex */
6833         if (device->name) {
6834                 name = rcu_string_strdup(device->name->str, GFP_NOFS);
6835                 BUG_ON(!name); /* -ENOMEM */
6836                 rcu_assign_pointer(new_device->name, name);
6837         }
6838
6839         list_replace_rcu(&device->dev_list, &new_device->dev_list);
6840         new_device->fs_devices = device->fs_devices;
6841
6842         call_rcu(&device->rcu, free_device);
6843 }