Btrfs: Fix page count calculation
[cascardo/linux.git] / fs / btrfs / volumes.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/bio.h>
20 #include <linux/slab.h>
21 #include <linux/buffer_head.h>
22 #include <linux/blkdev.h>
23 #include <linux/random.h>
24 #include <linux/iocontext.h>
25 #include <linux/capability.h>
26 #include <asm/div64.h>
27 #include "compat.h"
28 #include "ctree.h"
29 #include "extent_map.h"
30 #include "disk-io.h"
31 #include "transaction.h"
32 #include "print-tree.h"
33 #include "volumes.h"
34 #include "async-thread.h"
35
36 struct map_lookup {
37         u64 type;
38         int io_align;
39         int io_width;
40         int stripe_len;
41         int sector_size;
42         int num_stripes;
43         int sub_stripes;
44         struct btrfs_bio_stripe stripes[];
45 };
46
47 static int init_first_rw_device(struct btrfs_trans_handle *trans,
48                                 struct btrfs_root *root,
49                                 struct btrfs_device *device);
50 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
51
52 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
53                             (sizeof(struct btrfs_bio_stripe) * (n)))
54
55 static DEFINE_MUTEX(uuid_mutex);
56 static LIST_HEAD(fs_uuids);
57
58 void btrfs_lock_volumes(void)
59 {
60         mutex_lock(&uuid_mutex);
61 }
62
63 void btrfs_unlock_volumes(void)
64 {
65         mutex_unlock(&uuid_mutex);
66 }
67
68 static void lock_chunks(struct btrfs_root *root)
69 {
70         mutex_lock(&root->fs_info->chunk_mutex);
71 }
72
73 static void unlock_chunks(struct btrfs_root *root)
74 {
75         mutex_unlock(&root->fs_info->chunk_mutex);
76 }
77
78 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
79 {
80         struct btrfs_device *device;
81         WARN_ON(fs_devices->opened);
82         while (!list_empty(&fs_devices->devices)) {
83                 device = list_entry(fs_devices->devices.next,
84                                     struct btrfs_device, dev_list);
85                 list_del(&device->dev_list);
86                 kfree(device->name);
87                 kfree(device);
88         }
89         kfree(fs_devices);
90 }
91
92 int btrfs_cleanup_fs_uuids(void)
93 {
94         struct btrfs_fs_devices *fs_devices;
95
96         while (!list_empty(&fs_uuids)) {
97                 fs_devices = list_entry(fs_uuids.next,
98                                         struct btrfs_fs_devices, list);
99                 list_del(&fs_devices->list);
100                 free_fs_devices(fs_devices);
101         }
102         return 0;
103 }
104
105 static noinline struct btrfs_device *__find_device(struct list_head *head,
106                                                    u64 devid, u8 *uuid)
107 {
108         struct btrfs_device *dev;
109
110         list_for_each_entry(dev, head, dev_list) {
111                 if (dev->devid == devid &&
112                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
113                         return dev;
114                 }
115         }
116         return NULL;
117 }
118
119 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
120 {
121         struct btrfs_fs_devices *fs_devices;
122
123         list_for_each_entry(fs_devices, &fs_uuids, list) {
124                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
125                         return fs_devices;
126         }
127         return NULL;
128 }
129
130 static void requeue_list(struct btrfs_pending_bios *pending_bios,
131                         struct bio *head, struct bio *tail)
132 {
133
134         struct bio *old_head;
135
136         old_head = pending_bios->head;
137         pending_bios->head = head;
138         if (pending_bios->tail)
139                 tail->bi_next = old_head;
140         else
141                 pending_bios->tail = tail;
142 }
143
144 /*
145  * we try to collect pending bios for a device so we don't get a large
146  * number of procs sending bios down to the same device.  This greatly
147  * improves the schedulers ability to collect and merge the bios.
148  *
149  * But, it also turns into a long list of bios to process and that is sure
150  * to eventually make the worker thread block.  The solution here is to
151  * make some progress and then put this work struct back at the end of
152  * the list if the block device is congested.  This way, multiple devices
153  * can make progress from a single worker thread.
154  */
155 static noinline int run_scheduled_bios(struct btrfs_device *device)
156 {
157         struct bio *pending;
158         struct backing_dev_info *bdi;
159         struct btrfs_fs_info *fs_info;
160         struct btrfs_pending_bios *pending_bios;
161         struct bio *tail;
162         struct bio *cur;
163         int again = 0;
164         unsigned long num_run;
165         unsigned long num_sync_run;
166         unsigned long batch_run = 0;
167         unsigned long limit;
168         unsigned long last_waited = 0;
169         int force_reg = 0;
170
171         bdi = blk_get_backing_dev_info(device->bdev);
172         fs_info = device->dev_root->fs_info;
173         limit = btrfs_async_submit_limit(fs_info);
174         limit = limit * 2 / 3;
175
176         /* we want to make sure that every time we switch from the sync
177          * list to the normal list, we unplug
178          */
179         num_sync_run = 0;
180
181 loop:
182         spin_lock(&device->io_lock);
183
184 loop_lock:
185         num_run = 0;
186
187         /* take all the bios off the list at once and process them
188          * later on (without the lock held).  But, remember the
189          * tail and other pointers so the bios can be properly reinserted
190          * into the list if we hit congestion
191          */
192         if (!force_reg && device->pending_sync_bios.head) {
193                 pending_bios = &device->pending_sync_bios;
194                 force_reg = 1;
195         } else {
196                 pending_bios = &device->pending_bios;
197                 force_reg = 0;
198         }
199
200         pending = pending_bios->head;
201         tail = pending_bios->tail;
202         WARN_ON(pending && !tail);
203
204         /*
205          * if pending was null this time around, no bios need processing
206          * at all and we can stop.  Otherwise it'll loop back up again
207          * and do an additional check so no bios are missed.
208          *
209          * device->running_pending is used to synchronize with the
210          * schedule_bio code.
211          */
212         if (device->pending_sync_bios.head == NULL &&
213             device->pending_bios.head == NULL) {
214                 again = 0;
215                 device->running_pending = 0;
216         } else {
217                 again = 1;
218                 device->running_pending = 1;
219         }
220
221         pending_bios->head = NULL;
222         pending_bios->tail = NULL;
223
224         spin_unlock(&device->io_lock);
225
226         /*
227          * if we're doing the regular priority list, make sure we unplug
228          * for any high prio bios we've sent down
229          */
230         if (pending_bios == &device->pending_bios && num_sync_run > 0) {
231                 num_sync_run = 0;
232                 blk_run_backing_dev(bdi, NULL);
233         }
234
235         while (pending) {
236
237                 rmb();
238                 /* we want to work on both lists, but do more bios on the
239                  * sync list than the regular list
240                  */
241                 if ((num_run > 32 &&
242                     pending_bios != &device->pending_sync_bios &&
243                     device->pending_sync_bios.head) ||
244                    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
245                     device->pending_bios.head)) {
246                         spin_lock(&device->io_lock);
247                         requeue_list(pending_bios, pending, tail);
248                         goto loop_lock;
249                 }
250
251                 cur = pending;
252                 pending = pending->bi_next;
253                 cur->bi_next = NULL;
254                 atomic_dec(&fs_info->nr_async_bios);
255
256                 if (atomic_read(&fs_info->nr_async_bios) < limit &&
257                     waitqueue_active(&fs_info->async_submit_wait))
258                         wake_up(&fs_info->async_submit_wait);
259
260                 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
261
262                 if (cur->bi_rw & REQ_SYNC)
263                         num_sync_run++;
264
265                 submit_bio(cur->bi_rw, cur);
266                 num_run++;
267                 batch_run++;
268                 if (need_resched()) {
269                         if (num_sync_run) {
270                                 blk_run_backing_dev(bdi, NULL);
271                                 num_sync_run = 0;
272                         }
273                         cond_resched();
274                 }
275
276                 /*
277                  * we made progress, there is more work to do and the bdi
278                  * is now congested.  Back off and let other work structs
279                  * run instead
280                  */
281                 if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
282                     fs_info->fs_devices->open_devices > 1) {
283                         struct io_context *ioc;
284
285                         ioc = current->io_context;
286
287                         /*
288                          * the main goal here is that we don't want to
289                          * block if we're going to be able to submit
290                          * more requests without blocking.
291                          *
292                          * This code does two great things, it pokes into
293                          * the elevator code from a filesystem _and_
294                          * it makes assumptions about how batching works.
295                          */
296                         if (ioc && ioc->nr_batch_requests > 0 &&
297                             time_before(jiffies, ioc->last_waited + HZ/50UL) &&
298                             (last_waited == 0 ||
299                              ioc->last_waited == last_waited)) {
300                                 /*
301                                  * we want to go through our batch of
302                                  * requests and stop.  So, we copy out
303                                  * the ioc->last_waited time and test
304                                  * against it before looping
305                                  */
306                                 last_waited = ioc->last_waited;
307                                 if (need_resched()) {
308                                         if (num_sync_run) {
309                                                 blk_run_backing_dev(bdi, NULL);
310                                                 num_sync_run = 0;
311                                         }
312                                         cond_resched();
313                                 }
314                                 continue;
315                         }
316                         spin_lock(&device->io_lock);
317                         requeue_list(pending_bios, pending, tail);
318                         device->running_pending = 1;
319
320                         spin_unlock(&device->io_lock);
321                         btrfs_requeue_work(&device->work);
322                         goto done;
323                 }
324         }
325
326         if (num_sync_run) {
327                 num_sync_run = 0;
328                 blk_run_backing_dev(bdi, NULL);
329         }
330         /*
331          * IO has already been through a long path to get here.  Checksumming,
332          * async helper threads, perhaps compression.  We've done a pretty
333          * good job of collecting a batch of IO and should just unplug
334          * the device right away.
335          *
336          * This will help anyone who is waiting on the IO, they might have
337          * already unplugged, but managed to do so before the bio they
338          * cared about found its way down here.
339          */
340         blk_run_backing_dev(bdi, NULL);
341
342         cond_resched();
343         if (again)
344                 goto loop;
345
346         spin_lock(&device->io_lock);
347         if (device->pending_bios.head || device->pending_sync_bios.head)
348                 goto loop_lock;
349         spin_unlock(&device->io_lock);
350
351 done:
352         return 0;
353 }
354
355 static void pending_bios_fn(struct btrfs_work *work)
356 {
357         struct btrfs_device *device;
358
359         device = container_of(work, struct btrfs_device, work);
360         run_scheduled_bios(device);
361 }
362
363 static noinline int device_list_add(const char *path,
364                            struct btrfs_super_block *disk_super,
365                            u64 devid, struct btrfs_fs_devices **fs_devices_ret)
366 {
367         struct btrfs_device *device;
368         struct btrfs_fs_devices *fs_devices;
369         u64 found_transid = btrfs_super_generation(disk_super);
370         char *name;
371
372         fs_devices = find_fsid(disk_super->fsid);
373         if (!fs_devices) {
374                 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
375                 if (!fs_devices)
376                         return -ENOMEM;
377                 INIT_LIST_HEAD(&fs_devices->devices);
378                 INIT_LIST_HEAD(&fs_devices->alloc_list);
379                 list_add(&fs_devices->list, &fs_uuids);
380                 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
381                 fs_devices->latest_devid = devid;
382                 fs_devices->latest_trans = found_transid;
383                 mutex_init(&fs_devices->device_list_mutex);
384                 device = NULL;
385         } else {
386                 device = __find_device(&fs_devices->devices, devid,
387                                        disk_super->dev_item.uuid);
388         }
389         if (!device) {
390                 if (fs_devices->opened)
391                         return -EBUSY;
392
393                 device = kzalloc(sizeof(*device), GFP_NOFS);
394                 if (!device) {
395                         /* we can safely leave the fs_devices entry around */
396                         return -ENOMEM;
397                 }
398                 device->devid = devid;
399                 device->work.func = pending_bios_fn;
400                 memcpy(device->uuid, disk_super->dev_item.uuid,
401                        BTRFS_UUID_SIZE);
402                 device->barriers = 1;
403                 spin_lock_init(&device->io_lock);
404                 device->name = kstrdup(path, GFP_NOFS);
405                 if (!device->name) {
406                         kfree(device);
407                         return -ENOMEM;
408                 }
409                 INIT_LIST_HEAD(&device->dev_alloc_list);
410
411                 mutex_lock(&fs_devices->device_list_mutex);
412                 list_add(&device->dev_list, &fs_devices->devices);
413                 mutex_unlock(&fs_devices->device_list_mutex);
414
415                 device->fs_devices = fs_devices;
416                 fs_devices->num_devices++;
417         } else if (!device->name || strcmp(device->name, path)) {
418                 name = kstrdup(path, GFP_NOFS);
419                 if (!name)
420                         return -ENOMEM;
421                 kfree(device->name);
422                 device->name = name;
423                 if (device->missing) {
424                         fs_devices->missing_devices--;
425                         device->missing = 0;
426                 }
427         }
428
429         if (found_transid > fs_devices->latest_trans) {
430                 fs_devices->latest_devid = devid;
431                 fs_devices->latest_trans = found_transid;
432         }
433         *fs_devices_ret = fs_devices;
434         return 0;
435 }
436
437 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
438 {
439         struct btrfs_fs_devices *fs_devices;
440         struct btrfs_device *device;
441         struct btrfs_device *orig_dev;
442
443         fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
444         if (!fs_devices)
445                 return ERR_PTR(-ENOMEM);
446
447         INIT_LIST_HEAD(&fs_devices->devices);
448         INIT_LIST_HEAD(&fs_devices->alloc_list);
449         INIT_LIST_HEAD(&fs_devices->list);
450         mutex_init(&fs_devices->device_list_mutex);
451         fs_devices->latest_devid = orig->latest_devid;
452         fs_devices->latest_trans = orig->latest_trans;
453         memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
454
455         mutex_lock(&orig->device_list_mutex);
456         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
457                 device = kzalloc(sizeof(*device), GFP_NOFS);
458                 if (!device)
459                         goto error;
460
461                 device->name = kstrdup(orig_dev->name, GFP_NOFS);
462                 if (!device->name) {
463                         kfree(device);
464                         goto error;
465                 }
466
467                 device->devid = orig_dev->devid;
468                 device->work.func = pending_bios_fn;
469                 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
470                 device->barriers = 1;
471                 spin_lock_init(&device->io_lock);
472                 INIT_LIST_HEAD(&device->dev_list);
473                 INIT_LIST_HEAD(&device->dev_alloc_list);
474
475                 list_add(&device->dev_list, &fs_devices->devices);
476                 device->fs_devices = fs_devices;
477                 fs_devices->num_devices++;
478         }
479         mutex_unlock(&orig->device_list_mutex);
480         return fs_devices;
481 error:
482         mutex_unlock(&orig->device_list_mutex);
483         free_fs_devices(fs_devices);
484         return ERR_PTR(-ENOMEM);
485 }
486
487 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
488 {
489         struct btrfs_device *device, *next;
490
491         mutex_lock(&uuid_mutex);
492 again:
493         mutex_lock(&fs_devices->device_list_mutex);
494         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
495                 if (device->in_fs_metadata)
496                         continue;
497
498                 if (device->bdev) {
499                         close_bdev_exclusive(device->bdev, device->mode);
500                         device->bdev = NULL;
501                         fs_devices->open_devices--;
502                 }
503                 if (device->writeable) {
504                         list_del_init(&device->dev_alloc_list);
505                         device->writeable = 0;
506                         fs_devices->rw_devices--;
507                 }
508                 list_del_init(&device->dev_list);
509                 fs_devices->num_devices--;
510                 kfree(device->name);
511                 kfree(device);
512         }
513         mutex_unlock(&fs_devices->device_list_mutex);
514
515         if (fs_devices->seed) {
516                 fs_devices = fs_devices->seed;
517                 goto again;
518         }
519
520         mutex_unlock(&uuid_mutex);
521         return 0;
522 }
523
524 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
525 {
526         struct btrfs_device *device;
527
528         if (--fs_devices->opened > 0)
529                 return 0;
530
531         list_for_each_entry(device, &fs_devices->devices, dev_list) {
532                 if (device->bdev) {
533                         close_bdev_exclusive(device->bdev, device->mode);
534                         fs_devices->open_devices--;
535                 }
536                 if (device->writeable) {
537                         list_del_init(&device->dev_alloc_list);
538                         fs_devices->rw_devices--;
539                 }
540
541                 device->bdev = NULL;
542                 device->writeable = 0;
543                 device->in_fs_metadata = 0;
544         }
545         WARN_ON(fs_devices->open_devices);
546         WARN_ON(fs_devices->rw_devices);
547         fs_devices->opened = 0;
548         fs_devices->seeding = 0;
549
550         return 0;
551 }
552
553 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
554 {
555         struct btrfs_fs_devices *seed_devices = NULL;
556         int ret;
557
558         mutex_lock(&uuid_mutex);
559         ret = __btrfs_close_devices(fs_devices);
560         if (!fs_devices->opened) {
561                 seed_devices = fs_devices->seed;
562                 fs_devices->seed = NULL;
563         }
564         mutex_unlock(&uuid_mutex);
565
566         while (seed_devices) {
567                 fs_devices = seed_devices;
568                 seed_devices = fs_devices->seed;
569                 __btrfs_close_devices(fs_devices);
570                 free_fs_devices(fs_devices);
571         }
572         return ret;
573 }
574
575 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
576                                 fmode_t flags, void *holder)
577 {
578         struct block_device *bdev;
579         struct list_head *head = &fs_devices->devices;
580         struct btrfs_device *device;
581         struct block_device *latest_bdev = NULL;
582         struct buffer_head *bh;
583         struct btrfs_super_block *disk_super;
584         u64 latest_devid = 0;
585         u64 latest_transid = 0;
586         u64 devid;
587         int seeding = 1;
588         int ret = 0;
589
590         list_for_each_entry(device, head, dev_list) {
591                 if (device->bdev)
592                         continue;
593                 if (!device->name)
594                         continue;
595
596                 bdev = open_bdev_exclusive(device->name, flags, holder);
597                 if (IS_ERR(bdev)) {
598                         printk(KERN_INFO "open %s failed\n", device->name);
599                         goto error;
600                 }
601                 set_blocksize(bdev, 4096);
602
603                 bh = btrfs_read_dev_super(bdev);
604                 if (!bh) {
605                         ret = -EINVAL;
606                         goto error_close;
607                 }
608
609                 disk_super = (struct btrfs_super_block *)bh->b_data;
610                 devid = btrfs_stack_device_id(&disk_super->dev_item);
611                 if (devid != device->devid)
612                         goto error_brelse;
613
614                 if (memcmp(device->uuid, disk_super->dev_item.uuid,
615                            BTRFS_UUID_SIZE))
616                         goto error_brelse;
617
618                 device->generation = btrfs_super_generation(disk_super);
619                 if (!latest_transid || device->generation > latest_transid) {
620                         latest_devid = devid;
621                         latest_transid = device->generation;
622                         latest_bdev = bdev;
623                 }
624
625                 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
626                         device->writeable = 0;
627                 } else {
628                         device->writeable = !bdev_read_only(bdev);
629                         seeding = 0;
630                 }
631
632                 device->bdev = bdev;
633                 device->in_fs_metadata = 0;
634                 device->mode = flags;
635
636                 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
637                         fs_devices->rotating = 1;
638
639                 fs_devices->open_devices++;
640                 if (device->writeable) {
641                         fs_devices->rw_devices++;
642                         list_add(&device->dev_alloc_list,
643                                  &fs_devices->alloc_list);
644                 }
645                 continue;
646
647 error_brelse:
648                 brelse(bh);
649 error_close:
650                 close_bdev_exclusive(bdev, FMODE_READ);
651 error:
652                 continue;
653         }
654         if (fs_devices->open_devices == 0) {
655                 ret = -EIO;
656                 goto out;
657         }
658         fs_devices->seeding = seeding;
659         fs_devices->opened = 1;
660         fs_devices->latest_bdev = latest_bdev;
661         fs_devices->latest_devid = latest_devid;
662         fs_devices->latest_trans = latest_transid;
663         fs_devices->total_rw_bytes = 0;
664 out:
665         return ret;
666 }
667
668 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
669                        fmode_t flags, void *holder)
670 {
671         int ret;
672
673         mutex_lock(&uuid_mutex);
674         if (fs_devices->opened) {
675                 fs_devices->opened++;
676                 ret = 0;
677         } else {
678                 ret = __btrfs_open_devices(fs_devices, flags, holder);
679         }
680         mutex_unlock(&uuid_mutex);
681         return ret;
682 }
683
684 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
685                           struct btrfs_fs_devices **fs_devices_ret)
686 {
687         struct btrfs_super_block *disk_super;
688         struct block_device *bdev;
689         struct buffer_head *bh;
690         int ret;
691         u64 devid;
692         u64 transid;
693
694         mutex_lock(&uuid_mutex);
695
696         bdev = open_bdev_exclusive(path, flags, holder);
697
698         if (IS_ERR(bdev)) {
699                 ret = PTR_ERR(bdev);
700                 goto error;
701         }
702
703         ret = set_blocksize(bdev, 4096);
704         if (ret)
705                 goto error_close;
706         bh = btrfs_read_dev_super(bdev);
707         if (!bh) {
708                 ret = -EINVAL;
709                 goto error_close;
710         }
711         disk_super = (struct btrfs_super_block *)bh->b_data;
712         devid = btrfs_stack_device_id(&disk_super->dev_item);
713         transid = btrfs_super_generation(disk_super);
714         if (disk_super->label[0])
715                 printk(KERN_INFO "device label %s ", disk_super->label);
716         else {
717                 /* FIXME, make a readl uuid parser */
718                 printk(KERN_INFO "device fsid %llx-%llx ",
719                        *(unsigned long long *)disk_super->fsid,
720                        *(unsigned long long *)(disk_super->fsid + 8));
721         }
722         printk(KERN_CONT "devid %llu transid %llu %s\n",
723                (unsigned long long)devid, (unsigned long long)transid, path);
724         ret = device_list_add(path, disk_super, devid, fs_devices_ret);
725
726         brelse(bh);
727 error_close:
728         close_bdev_exclusive(bdev, flags);
729 error:
730         mutex_unlock(&uuid_mutex);
731         return ret;
732 }
733
734 /* helper to account the used device space in the range */
735 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
736                                    u64 end, u64 *length)
737 {
738         struct btrfs_key key;
739         struct btrfs_root *root = device->dev_root;
740         struct btrfs_dev_extent *dev_extent;
741         struct btrfs_path *path;
742         u64 extent_end;
743         int ret;
744         int slot;
745         struct extent_buffer *l;
746
747         *length = 0;
748
749         if (start >= device->total_bytes)
750                 return 0;
751
752         path = btrfs_alloc_path();
753         if (!path)
754                 return -ENOMEM;
755         path->reada = 2;
756
757         key.objectid = device->devid;
758         key.offset = start;
759         key.type = BTRFS_DEV_EXTENT_KEY;
760
761         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
762         if (ret < 0)
763                 goto out;
764         if (ret > 0) {
765                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
766                 if (ret < 0)
767                         goto out;
768         }
769
770         while (1) {
771                 l = path->nodes[0];
772                 slot = path->slots[0];
773                 if (slot >= btrfs_header_nritems(l)) {
774                         ret = btrfs_next_leaf(root, path);
775                         if (ret == 0)
776                                 continue;
777                         if (ret < 0)
778                                 goto out;
779
780                         break;
781                 }
782                 btrfs_item_key_to_cpu(l, &key, slot);
783
784                 if (key.objectid < device->devid)
785                         goto next;
786
787                 if (key.objectid > device->devid)
788                         break;
789
790                 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
791                         goto next;
792
793                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
794                 extent_end = key.offset + btrfs_dev_extent_length(l,
795                                                                   dev_extent);
796                 if (key.offset <= start && extent_end > end) {
797                         *length = end - start + 1;
798                         break;
799                 } else if (key.offset <= start && extent_end > start)
800                         *length += extent_end - start;
801                 else if (key.offset > start && extent_end <= end)
802                         *length += extent_end - key.offset;
803                 else if (key.offset > start && key.offset <= end) {
804                         *length += end - key.offset + 1;
805                         break;
806                 } else if (key.offset > end)
807                         break;
808
809 next:
810                 path->slots[0]++;
811         }
812         ret = 0;
813 out:
814         btrfs_free_path(path);
815         return ret;
816 }
817
818 /*
819  * find_free_dev_extent - find free space in the specified device
820  * @trans:      transaction handler
821  * @device:     the device which we search the free space in
822  * @num_bytes:  the size of the free space that we need
823  * @start:      store the start of the free space.
824  * @len:        the size of the free space. that we find, or the size of the max
825  *              free space if we don't find suitable free space
826  *
827  * this uses a pretty simple search, the expectation is that it is
828  * called very infrequently and that a given device has a small number
829  * of extents
830  *
831  * @start is used to store the start of the free space if we find. But if we
832  * don't find suitable free space, it will be used to store the start position
833  * of the max free space.
834  *
835  * @len is used to store the size of the free space that we find.
836  * But if we don't find suitable free space, it is used to store the size of
837  * the max free space.
838  */
839 int find_free_dev_extent(struct btrfs_trans_handle *trans,
840                          struct btrfs_device *device, u64 num_bytes,
841                          u64 *start, u64 *len)
842 {
843         struct btrfs_key key;
844         struct btrfs_root *root = device->dev_root;
845         struct btrfs_dev_extent *dev_extent;
846         struct btrfs_path *path;
847         u64 hole_size;
848         u64 max_hole_start;
849         u64 max_hole_size;
850         u64 extent_end;
851         u64 search_start;
852         u64 search_end = device->total_bytes;
853         int ret;
854         int slot;
855         struct extent_buffer *l;
856
857         /* FIXME use last free of some kind */
858
859         /* we don't want to overwrite the superblock on the drive,
860          * so we make sure to start at an offset of at least 1MB
861          */
862         search_start = 1024 * 1024;
863
864         if (root->fs_info->alloc_start + num_bytes <= search_end)
865                 search_start = max(root->fs_info->alloc_start, search_start);
866
867         max_hole_start = search_start;
868         max_hole_size = 0;
869
870         if (search_start >= search_end) {
871                 ret = -ENOSPC;
872                 goto error;
873         }
874
875         path = btrfs_alloc_path();
876         if (!path) {
877                 ret = -ENOMEM;
878                 goto error;
879         }
880         path->reada = 2;
881
882         key.objectid = device->devid;
883         key.offset = search_start;
884         key.type = BTRFS_DEV_EXTENT_KEY;
885
886         ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
887         if (ret < 0)
888                 goto out;
889         if (ret > 0) {
890                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
891                 if (ret < 0)
892                         goto out;
893         }
894
895         while (1) {
896                 l = path->nodes[0];
897                 slot = path->slots[0];
898                 if (slot >= btrfs_header_nritems(l)) {
899                         ret = btrfs_next_leaf(root, path);
900                         if (ret == 0)
901                                 continue;
902                         if (ret < 0)
903                                 goto out;
904
905                         break;
906                 }
907                 btrfs_item_key_to_cpu(l, &key, slot);
908
909                 if (key.objectid < device->devid)
910                         goto next;
911
912                 if (key.objectid > device->devid)
913                         break;
914
915                 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
916                         goto next;
917
918                 if (key.offset > search_start) {
919                         hole_size = key.offset - search_start;
920
921                         if (hole_size > max_hole_size) {
922                                 max_hole_start = search_start;
923                                 max_hole_size = hole_size;
924                         }
925
926                         /*
927                          * If this free space is greater than which we need,
928                          * it must be the max free space that we have found
929                          * until now, so max_hole_start must point to the start
930                          * of this free space and the length of this free space
931                          * is stored in max_hole_size. Thus, we return
932                          * max_hole_start and max_hole_size and go back to the
933                          * caller.
934                          */
935                         if (hole_size >= num_bytes) {
936                                 ret = 0;
937                                 goto out;
938                         }
939                 }
940
941                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
942                 extent_end = key.offset + btrfs_dev_extent_length(l,
943                                                                   dev_extent);
944                 if (extent_end > search_start)
945                         search_start = extent_end;
946 next:
947                 path->slots[0]++;
948                 cond_resched();
949         }
950
951         hole_size = search_end- search_start;
952         if (hole_size > max_hole_size) {
953                 max_hole_start = search_start;
954                 max_hole_size = hole_size;
955         }
956
957         /* See above. */
958         if (hole_size < num_bytes)
959                 ret = -ENOSPC;
960         else
961                 ret = 0;
962
963 out:
964         btrfs_free_path(path);
965 error:
966         *start = max_hole_start;
967         if (len)
968                 *len = max_hole_size;
969         return ret;
970 }
971
972 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
973                           struct btrfs_device *device,
974                           u64 start)
975 {
976         int ret;
977         struct btrfs_path *path;
978         struct btrfs_root *root = device->dev_root;
979         struct btrfs_key key;
980         struct btrfs_key found_key;
981         struct extent_buffer *leaf = NULL;
982         struct btrfs_dev_extent *extent = NULL;
983
984         path = btrfs_alloc_path();
985         if (!path)
986                 return -ENOMEM;
987
988         key.objectid = device->devid;
989         key.offset = start;
990         key.type = BTRFS_DEV_EXTENT_KEY;
991
992         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
993         if (ret > 0) {
994                 ret = btrfs_previous_item(root, path, key.objectid,
995                                           BTRFS_DEV_EXTENT_KEY);
996                 BUG_ON(ret);
997                 leaf = path->nodes[0];
998                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
999                 extent = btrfs_item_ptr(leaf, path->slots[0],
1000                                         struct btrfs_dev_extent);
1001                 BUG_ON(found_key.offset > start || found_key.offset +
1002                        btrfs_dev_extent_length(leaf, extent) < start);
1003                 ret = 0;
1004         } else if (ret == 0) {
1005                 leaf = path->nodes[0];
1006                 extent = btrfs_item_ptr(leaf, path->slots[0],
1007                                         struct btrfs_dev_extent);
1008         }
1009         BUG_ON(ret);
1010
1011         if (device->bytes_used > 0)
1012                 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
1013         ret = btrfs_del_item(trans, root, path);
1014         BUG_ON(ret);
1015
1016         btrfs_free_path(path);
1017         return ret;
1018 }
1019
1020 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1021                            struct btrfs_device *device,
1022                            u64 chunk_tree, u64 chunk_objectid,
1023                            u64 chunk_offset, u64 start, u64 num_bytes)
1024 {
1025         int ret;
1026         struct btrfs_path *path;
1027         struct btrfs_root *root = device->dev_root;
1028         struct btrfs_dev_extent *extent;
1029         struct extent_buffer *leaf;
1030         struct btrfs_key key;
1031
1032         WARN_ON(!device->in_fs_metadata);
1033         path = btrfs_alloc_path();
1034         if (!path)
1035                 return -ENOMEM;
1036
1037         key.objectid = device->devid;
1038         key.offset = start;
1039         key.type = BTRFS_DEV_EXTENT_KEY;
1040         ret = btrfs_insert_empty_item(trans, root, path, &key,
1041                                       sizeof(*extent));
1042         BUG_ON(ret);
1043
1044         leaf = path->nodes[0];
1045         extent = btrfs_item_ptr(leaf, path->slots[0],
1046                                 struct btrfs_dev_extent);
1047         btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
1048         btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
1049         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1050
1051         write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
1052                     (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
1053                     BTRFS_UUID_SIZE);
1054
1055         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1056         btrfs_mark_buffer_dirty(leaf);
1057         btrfs_free_path(path);
1058         return ret;
1059 }
1060
1061 static noinline int find_next_chunk(struct btrfs_root *root,
1062                                     u64 objectid, u64 *offset)
1063 {
1064         struct btrfs_path *path;
1065         int ret;
1066         struct btrfs_key key;
1067         struct btrfs_chunk *chunk;
1068         struct btrfs_key found_key;
1069
1070         path = btrfs_alloc_path();
1071         BUG_ON(!path);
1072
1073         key.objectid = objectid;
1074         key.offset = (u64)-1;
1075         key.type = BTRFS_CHUNK_ITEM_KEY;
1076
1077         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1078         if (ret < 0)
1079                 goto error;
1080
1081         BUG_ON(ret == 0);
1082
1083         ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
1084         if (ret) {
1085                 *offset = 0;
1086         } else {
1087                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1088                                       path->slots[0]);
1089                 if (found_key.objectid != objectid)
1090                         *offset = 0;
1091                 else {
1092                         chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
1093                                                struct btrfs_chunk);
1094                         *offset = found_key.offset +
1095                                 btrfs_chunk_length(path->nodes[0], chunk);
1096                 }
1097         }
1098         ret = 0;
1099 error:
1100         btrfs_free_path(path);
1101         return ret;
1102 }
1103
1104 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
1105 {
1106         int ret;
1107         struct btrfs_key key;
1108         struct btrfs_key found_key;
1109         struct btrfs_path *path;
1110
1111         root = root->fs_info->chunk_root;
1112
1113         path = btrfs_alloc_path();
1114         if (!path)
1115                 return -ENOMEM;
1116
1117         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1118         key.type = BTRFS_DEV_ITEM_KEY;
1119         key.offset = (u64)-1;
1120
1121         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1122         if (ret < 0)
1123                 goto error;
1124
1125         BUG_ON(ret == 0);
1126
1127         ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
1128                                   BTRFS_DEV_ITEM_KEY);
1129         if (ret) {
1130                 *objectid = 1;
1131         } else {
1132                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1133                                       path->slots[0]);
1134                 *objectid = found_key.offset + 1;
1135         }
1136         ret = 0;
1137 error:
1138         btrfs_free_path(path);
1139         return ret;
1140 }
1141
1142 /*
1143  * the device information is stored in the chunk root
1144  * the btrfs_device struct should be fully filled in
1145  */
1146 int btrfs_add_device(struct btrfs_trans_handle *trans,
1147                      struct btrfs_root *root,
1148                      struct btrfs_device *device)
1149 {
1150         int ret;
1151         struct btrfs_path *path;
1152         struct btrfs_dev_item *dev_item;
1153         struct extent_buffer *leaf;
1154         struct btrfs_key key;
1155         unsigned long ptr;
1156
1157         root = root->fs_info->chunk_root;
1158
1159         path = btrfs_alloc_path();
1160         if (!path)
1161                 return -ENOMEM;
1162
1163         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1164         key.type = BTRFS_DEV_ITEM_KEY;
1165         key.offset = device->devid;
1166
1167         ret = btrfs_insert_empty_item(trans, root, path, &key,
1168                                       sizeof(*dev_item));
1169         if (ret)
1170                 goto out;
1171
1172         leaf = path->nodes[0];
1173         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1174
1175         btrfs_set_device_id(leaf, dev_item, device->devid);
1176         btrfs_set_device_generation(leaf, dev_item, 0);
1177         btrfs_set_device_type(leaf, dev_item, device->type);
1178         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1179         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1180         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1181         btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1182         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1183         btrfs_set_device_group(leaf, dev_item, 0);
1184         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1185         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1186         btrfs_set_device_start_offset(leaf, dev_item, 0);
1187
1188         ptr = (unsigned long)btrfs_device_uuid(dev_item);
1189         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1190         ptr = (unsigned long)btrfs_device_fsid(dev_item);
1191         write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
1192         btrfs_mark_buffer_dirty(leaf);
1193
1194         ret = 0;
1195 out:
1196         btrfs_free_path(path);
1197         return ret;
1198 }
1199
1200 static int btrfs_rm_dev_item(struct btrfs_root *root,
1201                              struct btrfs_device *device)
1202 {
1203         int ret;
1204         struct btrfs_path *path;
1205         struct btrfs_key key;
1206         struct btrfs_trans_handle *trans;
1207
1208         root = root->fs_info->chunk_root;
1209
1210         path = btrfs_alloc_path();
1211         if (!path)
1212                 return -ENOMEM;
1213
1214         trans = btrfs_start_transaction(root, 0);
1215         if (IS_ERR(trans)) {
1216                 btrfs_free_path(path);
1217                 return PTR_ERR(trans);
1218         }
1219         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1220         key.type = BTRFS_DEV_ITEM_KEY;
1221         key.offset = device->devid;
1222         lock_chunks(root);
1223
1224         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1225         if (ret < 0)
1226                 goto out;
1227
1228         if (ret > 0) {
1229                 ret = -ENOENT;
1230                 goto out;
1231         }
1232
1233         ret = btrfs_del_item(trans, root, path);
1234         if (ret)
1235                 goto out;
1236 out:
1237         btrfs_free_path(path);
1238         unlock_chunks(root);
1239         btrfs_commit_transaction(trans, root);
1240         return ret;
1241 }
1242
1243 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1244 {
1245         struct btrfs_device *device;
1246         struct btrfs_device *next_device;
1247         struct block_device *bdev;
1248         struct buffer_head *bh = NULL;
1249         struct btrfs_super_block *disk_super;
1250         u64 all_avail;
1251         u64 devid;
1252         u64 num_devices;
1253         u8 *dev_uuid;
1254         int ret = 0;
1255
1256         mutex_lock(&uuid_mutex);
1257         mutex_lock(&root->fs_info->volume_mutex);
1258
1259         all_avail = root->fs_info->avail_data_alloc_bits |
1260                 root->fs_info->avail_system_alloc_bits |
1261                 root->fs_info->avail_metadata_alloc_bits;
1262
1263         if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1264             root->fs_info->fs_devices->num_devices <= 4) {
1265                 printk(KERN_ERR "btrfs: unable to go below four devices "
1266                        "on raid10\n");
1267                 ret = -EINVAL;
1268                 goto out;
1269         }
1270
1271         if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1272             root->fs_info->fs_devices->num_devices <= 2) {
1273                 printk(KERN_ERR "btrfs: unable to go below two "
1274                        "devices on raid1\n");
1275                 ret = -EINVAL;
1276                 goto out;
1277         }
1278
1279         if (strcmp(device_path, "missing") == 0) {
1280                 struct list_head *devices;
1281                 struct btrfs_device *tmp;
1282
1283                 device = NULL;
1284                 devices = &root->fs_info->fs_devices->devices;
1285                 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1286                 list_for_each_entry(tmp, devices, dev_list) {
1287                         if (tmp->in_fs_metadata && !tmp->bdev) {
1288                                 device = tmp;
1289                                 break;
1290                         }
1291                 }
1292                 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1293                 bdev = NULL;
1294                 bh = NULL;
1295                 disk_super = NULL;
1296                 if (!device) {
1297                         printk(KERN_ERR "btrfs: no missing devices found to "
1298                                "remove\n");
1299                         goto out;
1300                 }
1301         } else {
1302                 bdev = open_bdev_exclusive(device_path, FMODE_READ,
1303                                       root->fs_info->bdev_holder);
1304                 if (IS_ERR(bdev)) {
1305                         ret = PTR_ERR(bdev);
1306                         goto out;
1307                 }
1308
1309                 set_blocksize(bdev, 4096);
1310                 bh = btrfs_read_dev_super(bdev);
1311                 if (!bh) {
1312                         ret = -EINVAL;
1313                         goto error_close;
1314                 }
1315                 disk_super = (struct btrfs_super_block *)bh->b_data;
1316                 devid = btrfs_stack_device_id(&disk_super->dev_item);
1317                 dev_uuid = disk_super->dev_item.uuid;
1318                 device = btrfs_find_device(root, devid, dev_uuid,
1319                                            disk_super->fsid);
1320                 if (!device) {
1321                         ret = -ENOENT;
1322                         goto error_brelse;
1323                 }
1324         }
1325
1326         if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1327                 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1328                        "device\n");
1329                 ret = -EINVAL;
1330                 goto error_brelse;
1331         }
1332
1333         if (device->writeable) {
1334                 list_del_init(&device->dev_alloc_list);
1335                 root->fs_info->fs_devices->rw_devices--;
1336         }
1337
1338         ret = btrfs_shrink_device(device, 0);
1339         if (ret)
1340                 goto error_brelse;
1341
1342         ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1343         if (ret)
1344                 goto error_brelse;
1345
1346         device->in_fs_metadata = 0;
1347
1348         /*
1349          * the device list mutex makes sure that we don't change
1350          * the device list while someone else is writing out all
1351          * the device supers.
1352          */
1353         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1354         list_del_init(&device->dev_list);
1355         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1356
1357         device->fs_devices->num_devices--;
1358
1359         if (device->missing)
1360                 root->fs_info->fs_devices->missing_devices--;
1361
1362         next_device = list_entry(root->fs_info->fs_devices->devices.next,
1363                                  struct btrfs_device, dev_list);
1364         if (device->bdev == root->fs_info->sb->s_bdev)
1365                 root->fs_info->sb->s_bdev = next_device->bdev;
1366         if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1367                 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1368
1369         if (device->bdev) {
1370                 close_bdev_exclusive(device->bdev, device->mode);
1371                 device->bdev = NULL;
1372                 device->fs_devices->open_devices--;
1373         }
1374
1375         num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1376         btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1377
1378         if (device->fs_devices->open_devices == 0) {
1379                 struct btrfs_fs_devices *fs_devices;
1380                 fs_devices = root->fs_info->fs_devices;
1381                 while (fs_devices) {
1382                         if (fs_devices->seed == device->fs_devices)
1383                                 break;
1384                         fs_devices = fs_devices->seed;
1385                 }
1386                 fs_devices->seed = device->fs_devices->seed;
1387                 device->fs_devices->seed = NULL;
1388                 __btrfs_close_devices(device->fs_devices);
1389                 free_fs_devices(device->fs_devices);
1390         }
1391
1392         /*
1393          * at this point, the device is zero sized.  We want to
1394          * remove it from the devices list and zero out the old super
1395          */
1396         if (device->writeable) {
1397                 /* make sure this device isn't detected as part of
1398                  * the FS anymore
1399                  */
1400                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1401                 set_buffer_dirty(bh);
1402                 sync_dirty_buffer(bh);
1403         }
1404
1405         kfree(device->name);
1406         kfree(device);
1407         ret = 0;
1408
1409 error_brelse:
1410         brelse(bh);
1411 error_close:
1412         if (bdev)
1413                 close_bdev_exclusive(bdev, FMODE_READ);
1414 out:
1415         mutex_unlock(&root->fs_info->volume_mutex);
1416         mutex_unlock(&uuid_mutex);
1417         return ret;
1418 }
1419
1420 /*
1421  * does all the dirty work required for changing file system's UUID.
1422  */
1423 static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1424                                 struct btrfs_root *root)
1425 {
1426         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1427         struct btrfs_fs_devices *old_devices;
1428         struct btrfs_fs_devices *seed_devices;
1429         struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
1430         struct btrfs_device *device;
1431         u64 super_flags;
1432
1433         BUG_ON(!mutex_is_locked(&uuid_mutex));
1434         if (!fs_devices->seeding)
1435                 return -EINVAL;
1436
1437         seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1438         if (!seed_devices)
1439                 return -ENOMEM;
1440
1441         old_devices = clone_fs_devices(fs_devices);
1442         if (IS_ERR(old_devices)) {
1443                 kfree(seed_devices);
1444                 return PTR_ERR(old_devices);
1445         }
1446
1447         list_add(&old_devices->list, &fs_uuids);
1448
1449         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1450         seed_devices->opened = 1;
1451         INIT_LIST_HEAD(&seed_devices->devices);
1452         INIT_LIST_HEAD(&seed_devices->alloc_list);
1453         mutex_init(&seed_devices->device_list_mutex);
1454         list_splice_init(&fs_devices->devices, &seed_devices->devices);
1455         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1456         list_for_each_entry(device, &seed_devices->devices, dev_list) {
1457                 device->fs_devices = seed_devices;
1458         }
1459
1460         fs_devices->seeding = 0;
1461         fs_devices->num_devices = 0;
1462         fs_devices->open_devices = 0;
1463         fs_devices->seed = seed_devices;
1464
1465         generate_random_uuid(fs_devices->fsid);
1466         memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1467         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1468         super_flags = btrfs_super_flags(disk_super) &
1469                       ~BTRFS_SUPER_FLAG_SEEDING;
1470         btrfs_set_super_flags(disk_super, super_flags);
1471
1472         return 0;
1473 }
1474
1475 /*
1476  * strore the expected generation for seed devices in device items.
1477  */
1478 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1479                                struct btrfs_root *root)
1480 {
1481         struct btrfs_path *path;
1482         struct extent_buffer *leaf;
1483         struct btrfs_dev_item *dev_item;
1484         struct btrfs_device *device;
1485         struct btrfs_key key;
1486         u8 fs_uuid[BTRFS_UUID_SIZE];
1487         u8 dev_uuid[BTRFS_UUID_SIZE];
1488         u64 devid;
1489         int ret;
1490
1491         path = btrfs_alloc_path();
1492         if (!path)
1493                 return -ENOMEM;
1494
1495         root = root->fs_info->chunk_root;
1496         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1497         key.offset = 0;
1498         key.type = BTRFS_DEV_ITEM_KEY;
1499
1500         while (1) {
1501                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1502                 if (ret < 0)
1503                         goto error;
1504
1505                 leaf = path->nodes[0];
1506 next_slot:
1507                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1508                         ret = btrfs_next_leaf(root, path);
1509                         if (ret > 0)
1510                                 break;
1511                         if (ret < 0)
1512                                 goto error;
1513                         leaf = path->nodes[0];
1514                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1515                         btrfs_release_path(root, path);
1516                         continue;
1517                 }
1518
1519                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1520                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1521                     key.type != BTRFS_DEV_ITEM_KEY)
1522                         break;
1523
1524                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1525                                           struct btrfs_dev_item);
1526                 devid = btrfs_device_id(leaf, dev_item);
1527                 read_extent_buffer(leaf, dev_uuid,
1528                                    (unsigned long)btrfs_device_uuid(dev_item),
1529                                    BTRFS_UUID_SIZE);
1530                 read_extent_buffer(leaf, fs_uuid,
1531                                    (unsigned long)btrfs_device_fsid(dev_item),
1532                                    BTRFS_UUID_SIZE);
1533                 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1534                 BUG_ON(!device);
1535
1536                 if (device->fs_devices->seeding) {
1537                         btrfs_set_device_generation(leaf, dev_item,
1538                                                     device->generation);
1539                         btrfs_mark_buffer_dirty(leaf);
1540                 }
1541
1542                 path->slots[0]++;
1543                 goto next_slot;
1544         }
1545         ret = 0;
1546 error:
1547         btrfs_free_path(path);
1548         return ret;
1549 }
1550
1551 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1552 {
1553         struct btrfs_trans_handle *trans;
1554         struct btrfs_device *device;
1555         struct block_device *bdev;
1556         struct list_head *devices;
1557         struct super_block *sb = root->fs_info->sb;
1558         u64 total_bytes;
1559         int seeding_dev = 0;
1560         int ret = 0;
1561
1562         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1563                 return -EINVAL;
1564
1565         bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
1566         if (IS_ERR(bdev))
1567                 return PTR_ERR(bdev);
1568
1569         if (root->fs_info->fs_devices->seeding) {
1570                 seeding_dev = 1;
1571                 down_write(&sb->s_umount);
1572                 mutex_lock(&uuid_mutex);
1573         }
1574
1575         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1576         mutex_lock(&root->fs_info->volume_mutex);
1577
1578         devices = &root->fs_info->fs_devices->devices;
1579         /*
1580          * we have the volume lock, so we don't need the extra
1581          * device list mutex while reading the list here.
1582          */
1583         list_for_each_entry(device, devices, dev_list) {
1584                 if (device->bdev == bdev) {
1585                         ret = -EEXIST;
1586                         goto error;
1587                 }
1588         }
1589
1590         device = kzalloc(sizeof(*device), GFP_NOFS);
1591         if (!device) {
1592                 /* we can safely leave the fs_devices entry around */
1593                 ret = -ENOMEM;
1594                 goto error;
1595         }
1596
1597         device->name = kstrdup(device_path, GFP_NOFS);
1598         if (!device->name) {
1599                 kfree(device);
1600                 ret = -ENOMEM;
1601                 goto error;
1602         }
1603
1604         ret = find_next_devid(root, &device->devid);
1605         if (ret) {
1606                 kfree(device);
1607                 goto error;
1608         }
1609
1610         trans = btrfs_start_transaction(root, 0);
1611         if (IS_ERR(trans)) {
1612                 kfree(device);
1613                 ret = PTR_ERR(trans);
1614                 goto error;
1615         }
1616
1617         lock_chunks(root);
1618
1619         device->barriers = 1;
1620         device->writeable = 1;
1621         device->work.func = pending_bios_fn;
1622         generate_random_uuid(device->uuid);
1623         spin_lock_init(&device->io_lock);
1624         device->generation = trans->transid;
1625         device->io_width = root->sectorsize;
1626         device->io_align = root->sectorsize;
1627         device->sector_size = root->sectorsize;
1628         device->total_bytes = i_size_read(bdev->bd_inode);
1629         device->disk_total_bytes = device->total_bytes;
1630         device->dev_root = root->fs_info->dev_root;
1631         device->bdev = bdev;
1632         device->in_fs_metadata = 1;
1633         device->mode = 0;
1634         set_blocksize(device->bdev, 4096);
1635
1636         if (seeding_dev) {
1637                 sb->s_flags &= ~MS_RDONLY;
1638                 ret = btrfs_prepare_sprout(trans, root);
1639                 BUG_ON(ret);
1640         }
1641
1642         device->fs_devices = root->fs_info->fs_devices;
1643
1644         /*
1645          * we don't want write_supers to jump in here with our device
1646          * half setup
1647          */
1648         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1649         list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1650         list_add(&device->dev_alloc_list,
1651                  &root->fs_info->fs_devices->alloc_list);
1652         root->fs_info->fs_devices->num_devices++;
1653         root->fs_info->fs_devices->open_devices++;
1654         root->fs_info->fs_devices->rw_devices++;
1655         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1656
1657         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1658                 root->fs_info->fs_devices->rotating = 1;
1659
1660         total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1661         btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1662                                     total_bytes + device->total_bytes);
1663
1664         total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1665         btrfs_set_super_num_devices(&root->fs_info->super_copy,
1666                                     total_bytes + 1);
1667         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1668
1669         if (seeding_dev) {
1670                 ret = init_first_rw_device(trans, root, device);
1671                 BUG_ON(ret);
1672                 ret = btrfs_finish_sprout(trans, root);
1673                 BUG_ON(ret);
1674         } else {
1675                 ret = btrfs_add_device(trans, root, device);
1676         }
1677
1678         /*
1679          * we've got more storage, clear any full flags on the space
1680          * infos
1681          */
1682         btrfs_clear_space_info_full(root->fs_info);
1683
1684         unlock_chunks(root);
1685         btrfs_commit_transaction(trans, root);
1686
1687         if (seeding_dev) {
1688                 mutex_unlock(&uuid_mutex);
1689                 up_write(&sb->s_umount);
1690
1691                 ret = btrfs_relocate_sys_chunks(root);
1692                 BUG_ON(ret);
1693         }
1694 out:
1695         mutex_unlock(&root->fs_info->volume_mutex);
1696         return ret;
1697 error:
1698         close_bdev_exclusive(bdev, 0);
1699         if (seeding_dev) {
1700                 mutex_unlock(&uuid_mutex);
1701                 up_write(&sb->s_umount);
1702         }
1703         goto out;
1704 }
1705
1706 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1707                                         struct btrfs_device *device)
1708 {
1709         int ret;
1710         struct btrfs_path *path;
1711         struct btrfs_root *root;
1712         struct btrfs_dev_item *dev_item;
1713         struct extent_buffer *leaf;
1714         struct btrfs_key key;
1715
1716         root = device->dev_root->fs_info->chunk_root;
1717
1718         path = btrfs_alloc_path();
1719         if (!path)
1720                 return -ENOMEM;
1721
1722         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1723         key.type = BTRFS_DEV_ITEM_KEY;
1724         key.offset = device->devid;
1725
1726         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1727         if (ret < 0)
1728                 goto out;
1729
1730         if (ret > 0) {
1731                 ret = -ENOENT;
1732                 goto out;
1733         }
1734
1735         leaf = path->nodes[0];
1736         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1737
1738         btrfs_set_device_id(leaf, dev_item, device->devid);
1739         btrfs_set_device_type(leaf, dev_item, device->type);
1740         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1741         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1742         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1743         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1744         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1745         btrfs_mark_buffer_dirty(leaf);
1746
1747 out:
1748         btrfs_free_path(path);
1749         return ret;
1750 }
1751
1752 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1753                       struct btrfs_device *device, u64 new_size)
1754 {
1755         struct btrfs_super_block *super_copy =
1756                 &device->dev_root->fs_info->super_copy;
1757         u64 old_total = btrfs_super_total_bytes(super_copy);
1758         u64 diff = new_size - device->total_bytes;
1759
1760         if (!device->writeable)
1761                 return -EACCES;
1762         if (new_size <= device->total_bytes)
1763                 return -EINVAL;
1764
1765         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1766         device->fs_devices->total_rw_bytes += diff;
1767
1768         device->total_bytes = new_size;
1769         device->disk_total_bytes = new_size;
1770         btrfs_clear_space_info_full(device->dev_root->fs_info);
1771
1772         return btrfs_update_device(trans, device);
1773 }
1774
1775 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1776                       struct btrfs_device *device, u64 new_size)
1777 {
1778         int ret;
1779         lock_chunks(device->dev_root);
1780         ret = __btrfs_grow_device(trans, device, new_size);
1781         unlock_chunks(device->dev_root);
1782         return ret;
1783 }
1784
1785 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1786                             struct btrfs_root *root,
1787                             u64 chunk_tree, u64 chunk_objectid,
1788                             u64 chunk_offset)
1789 {
1790         int ret;
1791         struct btrfs_path *path;
1792         struct btrfs_key key;
1793
1794         root = root->fs_info->chunk_root;
1795         path = btrfs_alloc_path();
1796         if (!path)
1797                 return -ENOMEM;
1798
1799         key.objectid = chunk_objectid;
1800         key.offset = chunk_offset;
1801         key.type = BTRFS_CHUNK_ITEM_KEY;
1802
1803         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1804         BUG_ON(ret);
1805
1806         ret = btrfs_del_item(trans, root, path);
1807         BUG_ON(ret);
1808
1809         btrfs_free_path(path);
1810         return 0;
1811 }
1812
1813 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1814                         chunk_offset)
1815 {
1816         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1817         struct btrfs_disk_key *disk_key;
1818         struct btrfs_chunk *chunk;
1819         u8 *ptr;
1820         int ret = 0;
1821         u32 num_stripes;
1822         u32 array_size;
1823         u32 len = 0;
1824         u32 cur;
1825         struct btrfs_key key;
1826
1827         array_size = btrfs_super_sys_array_size(super_copy);
1828
1829         ptr = super_copy->sys_chunk_array;
1830         cur = 0;
1831
1832         while (cur < array_size) {
1833                 disk_key = (struct btrfs_disk_key *)ptr;
1834                 btrfs_disk_key_to_cpu(&key, disk_key);
1835
1836                 len = sizeof(*disk_key);
1837
1838                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1839                         chunk = (struct btrfs_chunk *)(ptr + len);
1840                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1841                         len += btrfs_chunk_item_size(num_stripes);
1842                 } else {
1843                         ret = -EIO;
1844                         break;
1845                 }
1846                 if (key.objectid == chunk_objectid &&
1847                     key.offset == chunk_offset) {
1848                         memmove(ptr, ptr + len, array_size - (cur + len));
1849                         array_size -= len;
1850                         btrfs_set_super_sys_array_size(super_copy, array_size);
1851                 } else {
1852                         ptr += len;
1853                         cur += len;
1854                 }
1855         }
1856         return ret;
1857 }
1858
1859 static int btrfs_relocate_chunk(struct btrfs_root *root,
1860                          u64 chunk_tree, u64 chunk_objectid,
1861                          u64 chunk_offset)
1862 {
1863         struct extent_map_tree *em_tree;
1864         struct btrfs_root *extent_root;
1865         struct btrfs_trans_handle *trans;
1866         struct extent_map *em;
1867         struct map_lookup *map;
1868         int ret;
1869         int i;
1870
1871         root = root->fs_info->chunk_root;
1872         extent_root = root->fs_info->extent_root;
1873         em_tree = &root->fs_info->mapping_tree.map_tree;
1874
1875         ret = btrfs_can_relocate(extent_root, chunk_offset);
1876         if (ret)
1877                 return -ENOSPC;
1878
1879         /* step one, relocate all the extents inside this chunk */
1880         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1881         if (ret)
1882                 return ret;
1883
1884         trans = btrfs_start_transaction(root, 0);
1885         BUG_ON(IS_ERR(trans));
1886
1887         lock_chunks(root);
1888
1889         /*
1890          * step two, delete the device extents and the
1891          * chunk tree entries
1892          */
1893         read_lock(&em_tree->lock);
1894         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1895         read_unlock(&em_tree->lock);
1896
1897         BUG_ON(em->start > chunk_offset ||
1898                em->start + em->len < chunk_offset);
1899         map = (struct map_lookup *)em->bdev;
1900
1901         for (i = 0; i < map->num_stripes; i++) {
1902                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1903                                             map->stripes[i].physical);
1904                 BUG_ON(ret);
1905
1906                 if (map->stripes[i].dev) {
1907                         ret = btrfs_update_device(trans, map->stripes[i].dev);
1908                         BUG_ON(ret);
1909                 }
1910         }
1911         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1912                                chunk_offset);
1913
1914         BUG_ON(ret);
1915
1916         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1917                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1918                 BUG_ON(ret);
1919         }
1920
1921         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1922         BUG_ON(ret);
1923
1924         write_lock(&em_tree->lock);
1925         remove_extent_mapping(em_tree, em);
1926         write_unlock(&em_tree->lock);
1927
1928         kfree(map);
1929         em->bdev = NULL;
1930
1931         /* once for the tree */
1932         free_extent_map(em);
1933         /* once for us */
1934         free_extent_map(em);
1935
1936         unlock_chunks(root);
1937         btrfs_end_transaction(trans, root);
1938         return 0;
1939 }
1940
1941 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1942 {
1943         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1944         struct btrfs_path *path;
1945         struct extent_buffer *leaf;
1946         struct btrfs_chunk *chunk;
1947         struct btrfs_key key;
1948         struct btrfs_key found_key;
1949         u64 chunk_tree = chunk_root->root_key.objectid;
1950         u64 chunk_type;
1951         bool retried = false;
1952         int failed = 0;
1953         int ret;
1954
1955         path = btrfs_alloc_path();
1956         if (!path)
1957                 return -ENOMEM;
1958
1959 again:
1960         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1961         key.offset = (u64)-1;
1962         key.type = BTRFS_CHUNK_ITEM_KEY;
1963
1964         while (1) {
1965                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1966                 if (ret < 0)
1967                         goto error;
1968                 BUG_ON(ret == 0);
1969
1970                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
1971                                           key.type);
1972                 if (ret < 0)
1973                         goto error;
1974                 if (ret > 0)
1975                         break;
1976
1977                 leaf = path->nodes[0];
1978                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1979
1980                 chunk = btrfs_item_ptr(leaf, path->slots[0],
1981                                        struct btrfs_chunk);
1982                 chunk_type = btrfs_chunk_type(leaf, chunk);
1983                 btrfs_release_path(chunk_root, path);
1984
1985                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1986                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1987                                                    found_key.objectid,
1988                                                    found_key.offset);
1989                         if (ret == -ENOSPC)
1990                                 failed++;
1991                         else if (ret)
1992                                 BUG();
1993                 }
1994
1995                 if (found_key.offset == 0)
1996                         break;
1997                 key.offset = found_key.offset - 1;
1998         }
1999         ret = 0;
2000         if (failed && !retried) {
2001                 failed = 0;
2002                 retried = true;
2003                 goto again;
2004         } else if (failed && retried) {
2005                 WARN_ON(1);
2006                 ret = -ENOSPC;
2007         }
2008 error:
2009         btrfs_free_path(path);
2010         return ret;
2011 }
2012
2013 static u64 div_factor(u64 num, int factor)
2014 {
2015         if (factor == 10)
2016                 return num;
2017         num *= factor;
2018         do_div(num, 10);
2019         return num;
2020 }
2021
2022 int btrfs_balance(struct btrfs_root *dev_root)
2023 {
2024         int ret;
2025         struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
2026         struct btrfs_device *device;
2027         u64 old_size;
2028         u64 size_to_free;
2029         struct btrfs_path *path;
2030         struct btrfs_key key;
2031         struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
2032         struct btrfs_trans_handle *trans;
2033         struct btrfs_key found_key;
2034
2035         if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
2036                 return -EROFS;
2037
2038         if (!capable(CAP_SYS_ADMIN))
2039                 return -EPERM;
2040
2041         mutex_lock(&dev_root->fs_info->volume_mutex);
2042         dev_root = dev_root->fs_info->dev_root;
2043
2044         /* step one make some room on all the devices */
2045         list_for_each_entry(device, devices, dev_list) {
2046                 old_size = device->total_bytes;
2047                 size_to_free = div_factor(old_size, 1);
2048                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2049                 if (!device->writeable ||
2050                     device->total_bytes - device->bytes_used > size_to_free)
2051                         continue;
2052
2053                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2054                 if (ret == -ENOSPC)
2055                         break;
2056                 BUG_ON(ret);
2057
2058                 trans = btrfs_start_transaction(dev_root, 0);
2059                 BUG_ON(IS_ERR(trans));
2060
2061                 ret = btrfs_grow_device(trans, device, old_size);
2062                 BUG_ON(ret);
2063
2064                 btrfs_end_transaction(trans, dev_root);
2065         }
2066
2067         /* step two, relocate all the chunks */
2068         path = btrfs_alloc_path();
2069         BUG_ON(!path);
2070
2071         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2072         key.offset = (u64)-1;
2073         key.type = BTRFS_CHUNK_ITEM_KEY;
2074
2075         while (1) {
2076                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2077                 if (ret < 0)
2078                         goto error;
2079
2080                 /*
2081                  * this shouldn't happen, it means the last relocate
2082                  * failed
2083                  */
2084                 if (ret == 0)
2085                         break;
2086
2087                 ret = btrfs_previous_item(chunk_root, path, 0,
2088                                           BTRFS_CHUNK_ITEM_KEY);
2089                 if (ret)
2090                         break;
2091
2092                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2093                                       path->slots[0]);
2094                 if (found_key.objectid != key.objectid)
2095                         break;
2096
2097                 /* chunk zero is special */
2098                 if (found_key.offset == 0)
2099                         break;
2100
2101                 btrfs_release_path(chunk_root, path);
2102                 ret = btrfs_relocate_chunk(chunk_root,
2103                                            chunk_root->root_key.objectid,
2104                                            found_key.objectid,
2105                                            found_key.offset);
2106                 BUG_ON(ret && ret != -ENOSPC);
2107                 key.offset = found_key.offset - 1;
2108         }
2109         ret = 0;
2110 error:
2111         btrfs_free_path(path);
2112         mutex_unlock(&dev_root->fs_info->volume_mutex);
2113         return ret;
2114 }
2115
2116 /*
2117  * shrinking a device means finding all of the device extents past
2118  * the new size, and then following the back refs to the chunks.
2119  * The chunk relocation code actually frees the device extent
2120  */
2121 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2122 {
2123         struct btrfs_trans_handle *trans;
2124         struct btrfs_root *root = device->dev_root;
2125         struct btrfs_dev_extent *dev_extent = NULL;
2126         struct btrfs_path *path;
2127         u64 length;
2128         u64 chunk_tree;
2129         u64 chunk_objectid;
2130         u64 chunk_offset;
2131         int ret;
2132         int slot;
2133         int failed = 0;
2134         bool retried = false;
2135         struct extent_buffer *l;
2136         struct btrfs_key key;
2137         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2138         u64 old_total = btrfs_super_total_bytes(super_copy);
2139         u64 old_size = device->total_bytes;
2140         u64 diff = device->total_bytes - new_size;
2141
2142         if (new_size >= device->total_bytes)
2143                 return -EINVAL;
2144
2145         path = btrfs_alloc_path();
2146         if (!path)
2147                 return -ENOMEM;
2148
2149         path->reada = 2;
2150
2151         lock_chunks(root);
2152
2153         device->total_bytes = new_size;
2154         if (device->writeable)
2155                 device->fs_devices->total_rw_bytes -= diff;
2156         unlock_chunks(root);
2157
2158 again:
2159         key.objectid = device->devid;
2160         key.offset = (u64)-1;
2161         key.type = BTRFS_DEV_EXTENT_KEY;
2162
2163         while (1) {
2164                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2165                 if (ret < 0)
2166                         goto done;
2167
2168                 ret = btrfs_previous_item(root, path, 0, key.type);
2169                 if (ret < 0)
2170                         goto done;
2171                 if (ret) {
2172                         ret = 0;
2173                         btrfs_release_path(root, path);
2174                         break;
2175                 }
2176
2177                 l = path->nodes[0];
2178                 slot = path->slots[0];
2179                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2180
2181                 if (key.objectid != device->devid) {
2182                         btrfs_release_path(root, path);
2183                         break;
2184                 }
2185
2186                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2187                 length = btrfs_dev_extent_length(l, dev_extent);
2188
2189                 if (key.offset + length <= new_size) {
2190                         btrfs_release_path(root, path);
2191                         break;
2192                 }
2193
2194                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2195                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2196                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2197                 btrfs_release_path(root, path);
2198
2199                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2200                                            chunk_offset);
2201                 if (ret && ret != -ENOSPC)
2202                         goto done;
2203                 if (ret == -ENOSPC)
2204                         failed++;
2205                 key.offset -= 1;
2206         }
2207
2208         if (failed && !retried) {
2209                 failed = 0;
2210                 retried = true;
2211                 goto again;
2212         } else if (failed && retried) {
2213                 ret = -ENOSPC;
2214                 lock_chunks(root);
2215
2216                 device->total_bytes = old_size;
2217                 if (device->writeable)
2218                         device->fs_devices->total_rw_bytes += diff;
2219                 unlock_chunks(root);
2220                 goto done;
2221         }
2222
2223         /* Shrinking succeeded, else we would be at "done". */
2224         trans = btrfs_start_transaction(root, 0);
2225         if (IS_ERR(trans)) {
2226                 ret = PTR_ERR(trans);
2227                 goto done;
2228         }
2229
2230         lock_chunks(root);
2231
2232         device->disk_total_bytes = new_size;
2233         /* Now btrfs_update_device() will change the on-disk size. */
2234         ret = btrfs_update_device(trans, device);
2235         if (ret) {
2236                 unlock_chunks(root);
2237                 btrfs_end_transaction(trans, root);
2238                 goto done;
2239         }
2240         WARN_ON(diff > old_total);
2241         btrfs_set_super_total_bytes(super_copy, old_total - diff);
2242         unlock_chunks(root);
2243         btrfs_end_transaction(trans, root);
2244 done:
2245         btrfs_free_path(path);
2246         return ret;
2247 }
2248
2249 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2250                            struct btrfs_root *root,
2251                            struct btrfs_key *key,
2252                            struct btrfs_chunk *chunk, int item_size)
2253 {
2254         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2255         struct btrfs_disk_key disk_key;
2256         u32 array_size;
2257         u8 *ptr;
2258
2259         array_size = btrfs_super_sys_array_size(super_copy);
2260         if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
2261                 return -EFBIG;
2262
2263         ptr = super_copy->sys_chunk_array + array_size;
2264         btrfs_cpu_key_to_disk(&disk_key, key);
2265         memcpy(ptr, &disk_key, sizeof(disk_key));
2266         ptr += sizeof(disk_key);
2267         memcpy(ptr, chunk, item_size);
2268         item_size += sizeof(disk_key);
2269         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
2270         return 0;
2271 }
2272
2273 static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
2274                                         int num_stripes, int sub_stripes)
2275 {
2276         if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
2277                 return calc_size;
2278         else if (type & BTRFS_BLOCK_GROUP_RAID10)
2279                 return calc_size * (num_stripes / sub_stripes);
2280         else
2281                 return calc_size * num_stripes;
2282 }
2283
2284 /* Used to sort the devices by max_avail(descending sort) */
2285 int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
2286 {
2287         if (((struct btrfs_device_info *)dev_info1)->max_avail >
2288             ((struct btrfs_device_info *)dev_info2)->max_avail)
2289                 return -1;
2290         else if (((struct btrfs_device_info *)dev_info1)->max_avail <
2291                  ((struct btrfs_device_info *)dev_info2)->max_avail)
2292                 return 1;
2293         else
2294                 return 0;
2295 }
2296
2297 static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
2298                                  int *num_stripes, int *min_stripes,
2299                                  int *sub_stripes)
2300 {
2301         *num_stripes = 1;
2302         *min_stripes = 1;
2303         *sub_stripes = 0;
2304
2305         if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2306                 *num_stripes = fs_devices->rw_devices;
2307                 *min_stripes = 2;
2308         }
2309         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2310                 *num_stripes = 2;
2311                 *min_stripes = 2;
2312         }
2313         if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2314                 if (fs_devices->rw_devices < 2)
2315                         return -ENOSPC;
2316                 *num_stripes = 2;
2317                 *min_stripes = 2;
2318         }
2319         if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2320                 *num_stripes = fs_devices->rw_devices;
2321                 if (*num_stripes < 4)
2322                         return -ENOSPC;
2323                 *num_stripes &= ~(u32)1;
2324                 *sub_stripes = 2;
2325                 *min_stripes = 4;
2326         }
2327
2328         return 0;
2329 }
2330
2331 static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
2332                                     u64 proposed_size, u64 type,
2333                                     int num_stripes, int small_stripe)
2334 {
2335         int min_stripe_size = 1 * 1024 * 1024;
2336         u64 calc_size = proposed_size;
2337         u64 max_chunk_size = calc_size;
2338         int ncopies = 1;
2339
2340         if (type & (BTRFS_BLOCK_GROUP_RAID1 |
2341                     BTRFS_BLOCK_GROUP_DUP |
2342                     BTRFS_BLOCK_GROUP_RAID10))
2343                 ncopies = 2;
2344
2345         if (type & BTRFS_BLOCK_GROUP_DATA) {
2346                 max_chunk_size = 10 * calc_size;
2347                 min_stripe_size = 64 * 1024 * 1024;
2348         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2349                 max_chunk_size = 256 * 1024 * 1024;
2350                 min_stripe_size = 32 * 1024 * 1024;
2351         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2352                 calc_size = 8 * 1024 * 1024;
2353                 max_chunk_size = calc_size * 2;
2354                 min_stripe_size = 1 * 1024 * 1024;
2355         }
2356
2357         /* we don't want a chunk larger than 10% of writeable space */
2358         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2359                              max_chunk_size);
2360
2361         if (calc_size * num_stripes > max_chunk_size * ncopies) {
2362                 calc_size = max_chunk_size * ncopies;
2363                 do_div(calc_size, num_stripes);
2364                 do_div(calc_size, BTRFS_STRIPE_LEN);
2365                 calc_size *= BTRFS_STRIPE_LEN;
2366         }
2367
2368         /* we don't want tiny stripes */
2369         if (!small_stripe)
2370                 calc_size = max_t(u64, min_stripe_size, calc_size);
2371
2372         /*
2373          * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
2374          * we end up with something bigger than a stripe
2375          */
2376         calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
2377
2378         do_div(calc_size, BTRFS_STRIPE_LEN);
2379         calc_size *= BTRFS_STRIPE_LEN;
2380
2381         return calc_size;
2382 }
2383
2384 static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
2385                                                       int num_stripes)
2386 {
2387         struct map_lookup *new;
2388         size_t len = map_lookup_size(num_stripes);
2389
2390         BUG_ON(map->num_stripes < num_stripes);
2391
2392         if (map->num_stripes == num_stripes)
2393                 return map;
2394
2395         new = kmalloc(len, GFP_NOFS);
2396         if (!new) {
2397                 /* just change map->num_stripes */
2398                 map->num_stripes = num_stripes;
2399                 return map;
2400         }
2401
2402         memcpy(new, map, len);
2403         new->num_stripes = num_stripes;
2404         kfree(map);
2405         return new;
2406 }
2407
2408 /*
2409  * helper to allocate device space from btrfs_device_info, in which we stored
2410  * max free space information of every device. It is used when we can not
2411  * allocate chunks by default size.
2412  *
2413  * By this helper, we can allocate a new chunk as larger as possible.
2414  */
2415 static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
2416                                     struct btrfs_fs_devices *fs_devices,
2417                                     struct btrfs_device_info *devices,
2418                                     int nr_device, u64 type,
2419                                     struct map_lookup **map_lookup,
2420                                     int min_stripes, u64 *stripe_size)
2421 {
2422         int i, index, sort_again = 0;
2423         int min_devices = min_stripes;
2424         u64 max_avail, min_free;
2425         struct map_lookup *map = *map_lookup;
2426         int ret;
2427
2428         if (nr_device < min_stripes)
2429                 return -ENOSPC;
2430
2431         btrfs_descending_sort_devices(devices, nr_device);
2432
2433         max_avail = devices[0].max_avail;
2434         if (!max_avail)
2435                 return -ENOSPC;
2436
2437         for (i = 0; i < nr_device; i++) {
2438                 /*
2439                  * if dev_offset = 0, it means the free space of this device
2440                  * is less than what we need, and we didn't search max avail
2441                  * extent on this device, so do it now.
2442                  */
2443                 if (!devices[i].dev_offset) {
2444                         ret = find_free_dev_extent(trans, devices[i].dev,
2445                                                    max_avail,
2446                                                    &devices[i].dev_offset,
2447                                                    &devices[i].max_avail);
2448                         if (ret != 0 && ret != -ENOSPC)
2449                                 return ret;
2450                         sort_again = 1;
2451                 }
2452         }
2453
2454         /* we update the max avail free extent of each devices, sort again */
2455         if (sort_again)
2456                 btrfs_descending_sort_devices(devices, nr_device);
2457
2458         if (type & BTRFS_BLOCK_GROUP_DUP)
2459                 min_devices = 1;
2460
2461         if (!devices[min_devices - 1].max_avail)
2462                 return -ENOSPC;
2463
2464         max_avail = devices[min_devices - 1].max_avail;
2465         if (type & BTRFS_BLOCK_GROUP_DUP)
2466                 do_div(max_avail, 2);
2467
2468         max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
2469                                              min_stripes, 1);
2470         if (type & BTRFS_BLOCK_GROUP_DUP)
2471                 min_free = max_avail * 2;
2472         else
2473                 min_free = max_avail;
2474
2475         if (min_free > devices[min_devices - 1].max_avail)
2476                 return -ENOSPC;
2477
2478         map = __shrink_map_lookup_stripes(map, min_stripes);
2479         *stripe_size = max_avail;
2480
2481         index = 0;
2482         for (i = 0; i < min_stripes; i++) {
2483                 map->stripes[i].dev = devices[index].dev;
2484                 map->stripes[i].physical = devices[index].dev_offset;
2485                 if (type & BTRFS_BLOCK_GROUP_DUP) {
2486                         i++;
2487                         map->stripes[i].dev = devices[index].dev;
2488                         map->stripes[i].physical = devices[index].dev_offset +
2489                                                    max_avail;
2490                 }
2491                 index++;
2492         }
2493         *map_lookup = map;
2494
2495         return 0;
2496 }
2497
2498 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2499                                struct btrfs_root *extent_root,
2500                                struct map_lookup **map_ret,
2501                                u64 *num_bytes, u64 *stripe_size,
2502                                u64 start, u64 type)
2503 {
2504         struct btrfs_fs_info *info = extent_root->fs_info;
2505         struct btrfs_device *device = NULL;
2506         struct btrfs_fs_devices *fs_devices = info->fs_devices;
2507         struct list_head *cur;
2508         struct map_lookup *map;
2509         struct extent_map_tree *em_tree;
2510         struct extent_map *em;
2511         struct btrfs_device_info *devices_info;
2512         struct list_head private_devs;
2513         u64 calc_size = 1024 * 1024 * 1024;
2514         u64 min_free;
2515         u64 avail;
2516         u64 dev_offset;
2517         int num_stripes;
2518         int min_stripes;
2519         int sub_stripes;
2520         int min_devices;        /* the min number of devices we need */
2521         int i;
2522         int ret;
2523         int index;
2524
2525         if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2526             (type & BTRFS_BLOCK_GROUP_DUP)) {
2527                 WARN_ON(1);
2528                 type &= ~BTRFS_BLOCK_GROUP_DUP;
2529         }
2530         if (list_empty(&fs_devices->alloc_list))
2531                 return -ENOSPC;
2532
2533         ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
2534                                     &min_stripes, &sub_stripes);
2535         if (ret)
2536                 return ret;
2537
2538         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2539                                GFP_NOFS);
2540         if (!devices_info)
2541                 return -ENOMEM;
2542
2543         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2544         if (!map) {
2545                 ret = -ENOMEM;
2546                 goto error;
2547         }
2548         map->num_stripes = num_stripes;
2549
2550         cur = fs_devices->alloc_list.next;
2551         index = 0;
2552         i = 0;
2553
2554         calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
2555                                              num_stripes, 0);
2556
2557         if (type & BTRFS_BLOCK_GROUP_DUP) {
2558                 min_free = calc_size * 2;
2559                 min_devices = 1;
2560         } else {
2561                 min_free = calc_size;
2562                 min_devices = min_stripes;
2563         }
2564
2565         INIT_LIST_HEAD(&private_devs);
2566         while (index < num_stripes) {
2567                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2568                 BUG_ON(!device->writeable);
2569                 if (device->total_bytes > device->bytes_used)
2570                         avail = device->total_bytes - device->bytes_used;
2571                 else
2572                         avail = 0;
2573                 cur = cur->next;
2574
2575                 if (device->in_fs_metadata && avail >= min_free) {
2576                         ret = find_free_dev_extent(trans, device, min_free,
2577                                                    &devices_info[i].dev_offset,
2578                                                    &devices_info[i].max_avail);
2579                         if (ret == 0) {
2580                                 list_move_tail(&device->dev_alloc_list,
2581                                                &private_devs);
2582                                 map->stripes[index].dev = device;
2583                                 map->stripes[index].physical =
2584                                                 devices_info[i].dev_offset;
2585                                 index++;
2586                                 if (type & BTRFS_BLOCK_GROUP_DUP) {
2587                                         map->stripes[index].dev = device;
2588                                         map->stripes[index].physical =
2589                                                 devices_info[i].dev_offset +
2590                                                 calc_size;
2591                                         index++;
2592                                 }
2593                         } else if (ret != -ENOSPC)
2594                                 goto error;
2595
2596                         devices_info[i].dev = device;
2597                         i++;
2598                 } else if (device->in_fs_metadata &&
2599                            avail >= BTRFS_STRIPE_LEN) {
2600                         devices_info[i].dev = device;
2601                         devices_info[i].max_avail = avail;
2602                         i++;
2603                 }
2604
2605                 if (cur == &fs_devices->alloc_list)
2606                         break;
2607         }
2608
2609         list_splice(&private_devs, &fs_devices->alloc_list);
2610         if (index < num_stripes) {
2611                 if (index >= min_stripes) {
2612                         num_stripes = index;
2613                         if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2614                                 num_stripes /= sub_stripes;
2615                                 num_stripes *= sub_stripes;
2616                         }
2617
2618                         map = __shrink_map_lookup_stripes(map, num_stripes);
2619                 } else if (i >= min_devices) {
2620                         ret = __btrfs_alloc_tiny_space(trans, fs_devices,
2621                                                        devices_info, i, type,
2622                                                        &map, min_stripes,
2623                                                        &calc_size);
2624                         if (ret)
2625                                 goto error;
2626                 } else {
2627                         ret = -ENOSPC;
2628                         goto error;
2629                 }
2630         }
2631         map->sector_size = extent_root->sectorsize;
2632         map->stripe_len = BTRFS_STRIPE_LEN;
2633         map->io_align = BTRFS_STRIPE_LEN;
2634         map->io_width = BTRFS_STRIPE_LEN;
2635         map->type = type;
2636         map->sub_stripes = sub_stripes;
2637
2638         *map_ret = map;
2639         *stripe_size = calc_size;
2640         *num_bytes = chunk_bytes_by_type(type, calc_size,
2641                                          map->num_stripes, sub_stripes);
2642
2643         em = alloc_extent_map(GFP_NOFS);
2644         if (!em) {
2645                 ret = -ENOMEM;
2646                 goto error;
2647         }
2648         em->bdev = (struct block_device *)map;
2649         em->start = start;
2650         em->len = *num_bytes;
2651         em->block_start = 0;
2652         em->block_len = em->len;
2653
2654         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2655         write_lock(&em_tree->lock);
2656         ret = add_extent_mapping(em_tree, em);
2657         write_unlock(&em_tree->lock);
2658         BUG_ON(ret);
2659         free_extent_map(em);
2660
2661         ret = btrfs_make_block_group(trans, extent_root, 0, type,
2662                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2663                                      start, *num_bytes);
2664         BUG_ON(ret);
2665
2666         index = 0;
2667         while (index < map->num_stripes) {
2668                 device = map->stripes[index].dev;
2669                 dev_offset = map->stripes[index].physical;
2670
2671                 ret = btrfs_alloc_dev_extent(trans, device,
2672                                 info->chunk_root->root_key.objectid,
2673                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2674                                 start, dev_offset, calc_size);
2675                 BUG_ON(ret);
2676                 index++;
2677         }
2678
2679         kfree(devices_info);
2680         return 0;
2681
2682 error:
2683         kfree(map);
2684         kfree(devices_info);
2685         return ret;
2686 }
2687
2688 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2689                                 struct btrfs_root *extent_root,
2690                                 struct map_lookup *map, u64 chunk_offset,
2691                                 u64 chunk_size, u64 stripe_size)
2692 {
2693         u64 dev_offset;
2694         struct btrfs_key key;
2695         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2696         struct btrfs_device *device;
2697         struct btrfs_chunk *chunk;
2698         struct btrfs_stripe *stripe;
2699         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2700         int index = 0;
2701         int ret;
2702
2703         chunk = kzalloc(item_size, GFP_NOFS);
2704         if (!chunk)
2705                 return -ENOMEM;
2706
2707         index = 0;
2708         while (index < map->num_stripes) {
2709                 device = map->stripes[index].dev;
2710                 device->bytes_used += stripe_size;
2711                 ret = btrfs_update_device(trans, device);
2712                 BUG_ON(ret);
2713                 index++;
2714         }
2715
2716         index = 0;
2717         stripe = &chunk->stripe;
2718         while (index < map->num_stripes) {
2719                 device = map->stripes[index].dev;
2720                 dev_offset = map->stripes[index].physical;
2721
2722                 btrfs_set_stack_stripe_devid(stripe, device->devid);
2723                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2724                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2725                 stripe++;
2726                 index++;
2727         }
2728
2729         btrfs_set_stack_chunk_length(chunk, chunk_size);
2730         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2731         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2732         btrfs_set_stack_chunk_type(chunk, map->type);
2733         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2734         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2735         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2736         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2737         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2738
2739         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2740         key.type = BTRFS_CHUNK_ITEM_KEY;
2741         key.offset = chunk_offset;
2742
2743         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2744         BUG_ON(ret);
2745
2746         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2747                 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2748                                              item_size);
2749                 BUG_ON(ret);
2750         }
2751         kfree(chunk);
2752         return 0;
2753 }
2754
2755 /*
2756  * Chunk allocation falls into two parts. The first part does works
2757  * that make the new allocated chunk useable, but not do any operation
2758  * that modifies the chunk tree. The second part does the works that
2759  * require modifying the chunk tree. This division is important for the
2760  * bootstrap process of adding storage to a seed btrfs.
2761  */
2762 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2763                       struct btrfs_root *extent_root, u64 type)
2764 {
2765         u64 chunk_offset;
2766         u64 chunk_size;
2767         u64 stripe_size;
2768         struct map_lookup *map;
2769         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2770         int ret;
2771
2772         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2773                               &chunk_offset);
2774         if (ret)
2775                 return ret;
2776
2777         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2778                                   &stripe_size, chunk_offset, type);
2779         if (ret)
2780                 return ret;
2781
2782         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2783                                    chunk_size, stripe_size);
2784         BUG_ON(ret);
2785         return 0;
2786 }
2787
2788 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2789                                          struct btrfs_root *root,
2790                                          struct btrfs_device *device)
2791 {
2792         u64 chunk_offset;
2793         u64 sys_chunk_offset;
2794         u64 chunk_size;
2795         u64 sys_chunk_size;
2796         u64 stripe_size;
2797         u64 sys_stripe_size;
2798         u64 alloc_profile;
2799         struct map_lookup *map;
2800         struct map_lookup *sys_map;
2801         struct btrfs_fs_info *fs_info = root->fs_info;
2802         struct btrfs_root *extent_root = fs_info->extent_root;
2803         int ret;
2804
2805         ret = find_next_chunk(fs_info->chunk_root,
2806                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2807         BUG_ON(ret);
2808
2809         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2810                         (fs_info->metadata_alloc_profile &
2811                          fs_info->avail_metadata_alloc_bits);
2812         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2813
2814         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2815                                   &stripe_size, chunk_offset, alloc_profile);
2816         BUG_ON(ret);
2817
2818         sys_chunk_offset = chunk_offset + chunk_size;
2819
2820         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2821                         (fs_info->system_alloc_profile &
2822                          fs_info->avail_system_alloc_bits);
2823         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2824
2825         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2826                                   &sys_chunk_size, &sys_stripe_size,
2827                                   sys_chunk_offset, alloc_profile);
2828         BUG_ON(ret);
2829
2830         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2831         BUG_ON(ret);
2832
2833         /*
2834          * Modifying chunk tree needs allocating new blocks from both
2835          * system block group and metadata block group. So we only can
2836          * do operations require modifying the chunk tree after both
2837          * block groups were created.
2838          */
2839         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2840                                    chunk_size, stripe_size);
2841         BUG_ON(ret);
2842
2843         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2844                                    sys_chunk_offset, sys_chunk_size,
2845                                    sys_stripe_size);
2846         BUG_ON(ret);
2847         return 0;
2848 }
2849
2850 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2851 {
2852         struct extent_map *em;
2853         struct map_lookup *map;
2854         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2855         int readonly = 0;
2856         int i;
2857
2858         read_lock(&map_tree->map_tree.lock);
2859         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2860         read_unlock(&map_tree->map_tree.lock);
2861         if (!em)
2862                 return 1;
2863
2864         if (btrfs_test_opt(root, DEGRADED)) {
2865                 free_extent_map(em);
2866                 return 0;
2867         }
2868
2869         map = (struct map_lookup *)em->bdev;
2870         for (i = 0; i < map->num_stripes; i++) {
2871                 if (!map->stripes[i].dev->writeable) {
2872                         readonly = 1;
2873                         break;
2874                 }
2875         }
2876         free_extent_map(em);
2877         return readonly;
2878 }
2879
2880 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2881 {
2882         extent_map_tree_init(&tree->map_tree, GFP_NOFS);
2883 }
2884
2885 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2886 {
2887         struct extent_map *em;
2888
2889         while (1) {
2890                 write_lock(&tree->map_tree.lock);
2891                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2892                 if (em)
2893                         remove_extent_mapping(&tree->map_tree, em);
2894                 write_unlock(&tree->map_tree.lock);
2895                 if (!em)
2896                         break;
2897                 kfree(em->bdev);
2898                 /* once for us */
2899                 free_extent_map(em);
2900                 /* once for the tree */
2901                 free_extent_map(em);
2902         }
2903 }
2904
2905 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2906 {
2907         struct extent_map *em;
2908         struct map_lookup *map;
2909         struct extent_map_tree *em_tree = &map_tree->map_tree;
2910         int ret;
2911
2912         read_lock(&em_tree->lock);
2913         em = lookup_extent_mapping(em_tree, logical, len);
2914         read_unlock(&em_tree->lock);
2915         BUG_ON(!em);
2916
2917         BUG_ON(em->start > logical || em->start + em->len < logical);
2918         map = (struct map_lookup *)em->bdev;
2919         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2920                 ret = map->num_stripes;
2921         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2922                 ret = map->sub_stripes;
2923         else
2924                 ret = 1;
2925         free_extent_map(em);
2926         return ret;
2927 }
2928
2929 static int find_live_mirror(struct map_lookup *map, int first, int num,
2930                             int optimal)
2931 {
2932         int i;
2933         if (map->stripes[optimal].dev->bdev)
2934                 return optimal;
2935         for (i = first; i < first + num; i++) {
2936                 if (map->stripes[i].dev->bdev)
2937                         return i;
2938         }
2939         /* we couldn't find one that doesn't fail.  Just return something
2940          * and the io error handling code will clean up eventually
2941          */
2942         return optimal;
2943 }
2944
2945 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2946                              u64 logical, u64 *length,
2947                              struct btrfs_multi_bio **multi_ret,
2948                              int mirror_num, struct page *unplug_page)
2949 {
2950         struct extent_map *em;
2951         struct map_lookup *map;
2952         struct extent_map_tree *em_tree = &map_tree->map_tree;
2953         u64 offset;
2954         u64 stripe_offset;
2955         u64 stripe_nr;
2956         int stripes_allocated = 8;
2957         int stripes_required = 1;
2958         int stripe_index;
2959         int i;
2960         int num_stripes;
2961         int max_errors = 0;
2962         struct btrfs_multi_bio *multi = NULL;
2963
2964         if (multi_ret && !(rw & REQ_WRITE))
2965                 stripes_allocated = 1;
2966 again:
2967         if (multi_ret) {
2968                 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2969                                 GFP_NOFS);
2970                 if (!multi)
2971                         return -ENOMEM;
2972
2973                 atomic_set(&multi->error, 0);
2974         }
2975
2976         read_lock(&em_tree->lock);
2977         em = lookup_extent_mapping(em_tree, logical, *length);
2978         read_unlock(&em_tree->lock);
2979
2980         if (!em && unplug_page) {
2981                 kfree(multi);
2982                 return 0;
2983         }
2984
2985         if (!em) {
2986                 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2987                        (unsigned long long)logical,
2988                        (unsigned long long)*length);
2989                 BUG();
2990         }
2991
2992         BUG_ON(em->start > logical || em->start + em->len < logical);
2993         map = (struct map_lookup *)em->bdev;
2994         offset = logical - em->start;
2995
2996         if (mirror_num > map->num_stripes)
2997                 mirror_num = 0;
2998
2999         /* if our multi bio struct is too small, back off and try again */
3000         if (rw & REQ_WRITE) {
3001                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3002                                  BTRFS_BLOCK_GROUP_DUP)) {
3003                         stripes_required = map->num_stripes;
3004                         max_errors = 1;
3005                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3006                         stripes_required = map->sub_stripes;
3007                         max_errors = 1;
3008                 }
3009         }
3010         if (multi_ret && (rw & REQ_WRITE) &&
3011             stripes_allocated < stripes_required) {
3012                 stripes_allocated = map->num_stripes;
3013                 free_extent_map(em);
3014                 kfree(multi);
3015                 goto again;
3016         }
3017         stripe_nr = offset;
3018         /*
3019          * stripe_nr counts the total number of stripes we have to stride
3020          * to get to this block
3021          */
3022         do_div(stripe_nr, map->stripe_len);
3023
3024         stripe_offset = stripe_nr * map->stripe_len;
3025         BUG_ON(offset < stripe_offset);
3026
3027         /* stripe_offset is the offset of this block in its stripe*/
3028         stripe_offset = offset - stripe_offset;
3029
3030         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
3031                          BTRFS_BLOCK_GROUP_RAID10 |
3032                          BTRFS_BLOCK_GROUP_DUP)) {
3033                 /* we limit the length of each bio to what fits in a stripe */
3034                 *length = min_t(u64, em->len - offset,
3035                               map->stripe_len - stripe_offset);
3036         } else {
3037                 *length = em->len - offset;
3038         }
3039
3040         if (!multi_ret && !unplug_page)
3041                 goto out;
3042
3043         num_stripes = 1;
3044         stripe_index = 0;
3045         if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3046                 if (unplug_page || (rw & REQ_WRITE))
3047                         num_stripes = map->num_stripes;
3048                 else if (mirror_num)
3049                         stripe_index = mirror_num - 1;
3050                 else {
3051                         stripe_index = find_live_mirror(map, 0,
3052                                             map->num_stripes,
3053                                             current->pid % map->num_stripes);
3054                 }
3055
3056         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3057                 if (rw & REQ_WRITE)
3058                         num_stripes = map->num_stripes;
3059                 else if (mirror_num)
3060                         stripe_index = mirror_num - 1;
3061
3062         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3063                 int factor = map->num_stripes / map->sub_stripes;
3064
3065                 stripe_index = do_div(stripe_nr, factor);
3066                 stripe_index *= map->sub_stripes;
3067
3068                 if (unplug_page || (rw & REQ_WRITE))
3069                         num_stripes = map->sub_stripes;
3070                 else if (mirror_num)
3071                         stripe_index += mirror_num - 1;
3072                 else {
3073                         stripe_index = find_live_mirror(map, stripe_index,
3074                                               map->sub_stripes, stripe_index +
3075                                               current->pid % map->sub_stripes);
3076                 }
3077         } else {
3078                 /*
3079                  * after this do_div call, stripe_nr is the number of stripes
3080                  * on this device we have to walk to find the data, and
3081                  * stripe_index is the number of our device in the stripe array
3082                  */
3083                 stripe_index = do_div(stripe_nr, map->num_stripes);
3084         }
3085         BUG_ON(stripe_index >= map->num_stripes);
3086
3087         for (i = 0; i < num_stripes; i++) {
3088                 if (unplug_page) {
3089                         struct btrfs_device *device;
3090                         struct backing_dev_info *bdi;
3091
3092                         device = map->stripes[stripe_index].dev;
3093                         if (device->bdev) {
3094                                 bdi = blk_get_backing_dev_info(device->bdev);
3095                                 if (bdi->unplug_io_fn)
3096                                         bdi->unplug_io_fn(bdi, unplug_page);
3097                         }
3098                 } else {
3099                         multi->stripes[i].physical =
3100                                 map->stripes[stripe_index].physical +
3101                                 stripe_offset + stripe_nr * map->stripe_len;
3102                         multi->stripes[i].dev = map->stripes[stripe_index].dev;
3103                 }
3104                 stripe_index++;
3105         }
3106         if (multi_ret) {
3107                 *multi_ret = multi;
3108                 multi->num_stripes = num_stripes;
3109                 multi->max_errors = max_errors;
3110         }
3111 out:
3112         free_extent_map(em);
3113         return 0;
3114 }
3115
3116 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3117                       u64 logical, u64 *length,
3118                       struct btrfs_multi_bio **multi_ret, int mirror_num)
3119 {
3120         return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
3121                                  mirror_num, NULL);
3122 }
3123
3124 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3125                      u64 chunk_start, u64 physical, u64 devid,
3126                      u64 **logical, int *naddrs, int *stripe_len)
3127 {
3128         struct extent_map_tree *em_tree = &map_tree->map_tree;
3129         struct extent_map *em;
3130         struct map_lookup *map;
3131         u64 *buf;
3132         u64 bytenr;
3133         u64 length;
3134         u64 stripe_nr;
3135         int i, j, nr = 0;
3136
3137         read_lock(&em_tree->lock);
3138         em = lookup_extent_mapping(em_tree, chunk_start, 1);
3139         read_unlock(&em_tree->lock);
3140
3141         BUG_ON(!em || em->start != chunk_start);
3142         map = (struct map_lookup *)em->bdev;
3143
3144         length = em->len;
3145         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3146                 do_div(length, map->num_stripes / map->sub_stripes);
3147         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3148                 do_div(length, map->num_stripes);
3149
3150         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
3151         BUG_ON(!buf);
3152
3153         for (i = 0; i < map->num_stripes; i++) {
3154                 if (devid && map->stripes[i].dev->devid != devid)
3155                         continue;
3156                 if (map->stripes[i].physical > physical ||
3157                     map->stripes[i].physical + length <= physical)
3158                         continue;
3159
3160                 stripe_nr = physical - map->stripes[i].physical;
3161                 do_div(stripe_nr, map->stripe_len);
3162
3163                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3164                         stripe_nr = stripe_nr * map->num_stripes + i;
3165                         do_div(stripe_nr, map->sub_stripes);
3166                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3167                         stripe_nr = stripe_nr * map->num_stripes + i;
3168                 }
3169                 bytenr = chunk_start + stripe_nr * map->stripe_len;
3170                 WARN_ON(nr >= map->num_stripes);
3171                 for (j = 0; j < nr; j++) {
3172                         if (buf[j] == bytenr)
3173                                 break;
3174                 }
3175                 if (j == nr) {
3176                         WARN_ON(nr >= map->num_stripes);
3177                         buf[nr++] = bytenr;
3178                 }
3179         }
3180
3181         *logical = buf;
3182         *naddrs = nr;
3183         *stripe_len = map->stripe_len;
3184
3185         free_extent_map(em);
3186         return 0;
3187 }
3188
3189 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
3190                       u64 logical, struct page *page)
3191 {
3192         u64 length = PAGE_CACHE_SIZE;
3193         return __btrfs_map_block(map_tree, READ, logical, &length,
3194                                  NULL, 0, page);
3195 }
3196
3197 static void end_bio_multi_stripe(struct bio *bio, int err)
3198 {
3199         struct btrfs_multi_bio *multi = bio->bi_private;
3200         int is_orig_bio = 0;
3201
3202         if (err)
3203                 atomic_inc(&multi->error);
3204
3205         if (bio == multi->orig_bio)
3206                 is_orig_bio = 1;
3207
3208         if (atomic_dec_and_test(&multi->stripes_pending)) {
3209                 if (!is_orig_bio) {
3210                         bio_put(bio);
3211                         bio = multi->orig_bio;
3212                 }
3213                 bio->bi_private = multi->private;
3214                 bio->bi_end_io = multi->end_io;
3215                 /* only send an error to the higher layers if it is
3216                  * beyond the tolerance of the multi-bio
3217                  */
3218                 if (atomic_read(&multi->error) > multi->max_errors) {
3219                         err = -EIO;
3220                 } else if (err) {
3221                         /*
3222                          * this bio is actually up to date, we didn't
3223                          * go over the max number of errors
3224                          */
3225                         set_bit(BIO_UPTODATE, &bio->bi_flags);
3226                         err = 0;
3227                 }
3228                 kfree(multi);
3229
3230                 bio_endio(bio, err);
3231         } else if (!is_orig_bio) {
3232                 bio_put(bio);
3233         }
3234 }
3235
3236 struct async_sched {
3237         struct bio *bio;
3238         int rw;
3239         struct btrfs_fs_info *info;
3240         struct btrfs_work work;
3241 };
3242
3243 /*
3244  * see run_scheduled_bios for a description of why bios are collected for
3245  * async submit.
3246  *
3247  * This will add one bio to the pending list for a device and make sure
3248  * the work struct is scheduled.
3249  */
3250 static noinline int schedule_bio(struct btrfs_root *root,
3251                                  struct btrfs_device *device,
3252                                  int rw, struct bio *bio)
3253 {
3254         int should_queue = 1;
3255         struct btrfs_pending_bios *pending_bios;
3256
3257         /* don't bother with additional async steps for reads, right now */
3258         if (!(rw & REQ_WRITE)) {
3259                 bio_get(bio);
3260                 submit_bio(rw, bio);
3261                 bio_put(bio);
3262                 return 0;
3263         }
3264
3265         /*
3266          * nr_async_bios allows us to reliably return congestion to the
3267          * higher layers.  Otherwise, the async bio makes it appear we have
3268          * made progress against dirty pages when we've really just put it
3269          * on a queue for later
3270          */
3271         atomic_inc(&root->fs_info->nr_async_bios);
3272         WARN_ON(bio->bi_next);
3273         bio->bi_next = NULL;
3274         bio->bi_rw |= rw;
3275
3276         spin_lock(&device->io_lock);
3277         if (bio->bi_rw & REQ_SYNC)
3278                 pending_bios = &device->pending_sync_bios;
3279         else
3280                 pending_bios = &device->pending_bios;
3281
3282         if (pending_bios->tail)
3283                 pending_bios->tail->bi_next = bio;
3284
3285         pending_bios->tail = bio;
3286         if (!pending_bios->head)
3287                 pending_bios->head = bio;
3288         if (device->running_pending)
3289                 should_queue = 0;
3290
3291         spin_unlock(&device->io_lock);
3292
3293         if (should_queue)
3294                 btrfs_queue_worker(&root->fs_info->submit_workers,
3295                                    &device->work);
3296         return 0;
3297 }
3298
3299 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3300                   int mirror_num, int async_submit)
3301 {
3302         struct btrfs_mapping_tree *map_tree;
3303         struct btrfs_device *dev;
3304         struct bio *first_bio = bio;
3305         u64 logical = (u64)bio->bi_sector << 9;
3306         u64 length = 0;
3307         u64 map_length;
3308         struct btrfs_multi_bio *multi = NULL;
3309         int ret;
3310         int dev_nr = 0;
3311         int total_devs = 1;
3312
3313         length = bio->bi_size;
3314         map_tree = &root->fs_info->mapping_tree;
3315         map_length = length;
3316
3317         ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
3318                               mirror_num);
3319         BUG_ON(ret);
3320
3321         total_devs = multi->num_stripes;
3322         if (map_length < length) {
3323                 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3324                        "len %llu\n", (unsigned long long)logical,
3325                        (unsigned long long)length,
3326                        (unsigned long long)map_length);
3327                 BUG();
3328         }
3329         multi->end_io = first_bio->bi_end_io;
3330         multi->private = first_bio->bi_private;
3331         multi->orig_bio = first_bio;
3332         atomic_set(&multi->stripes_pending, multi->num_stripes);
3333
3334         while (dev_nr < total_devs) {
3335                 if (total_devs > 1) {
3336                         if (dev_nr < total_devs - 1) {
3337                                 bio = bio_clone(first_bio, GFP_NOFS);
3338                                 BUG_ON(!bio);
3339                         } else {
3340                                 bio = first_bio;
3341                         }
3342                         bio->bi_private = multi;
3343                         bio->bi_end_io = end_bio_multi_stripe;
3344                 }
3345                 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
3346                 dev = multi->stripes[dev_nr].dev;
3347                 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3348                         bio->bi_bdev = dev->bdev;
3349                         if (async_submit)
3350                                 schedule_bio(root, dev, rw, bio);
3351                         else
3352                                 submit_bio(rw, bio);
3353                 } else {
3354                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
3355                         bio->bi_sector = logical >> 9;
3356                         bio_endio(bio, -EIO);
3357                 }
3358                 dev_nr++;
3359         }
3360         if (total_devs == 1)
3361                 kfree(multi);
3362         return 0;
3363 }
3364
3365 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
3366                                        u8 *uuid, u8 *fsid)
3367 {
3368         struct btrfs_device *device;
3369         struct btrfs_fs_devices *cur_devices;
3370
3371         cur_devices = root->fs_info->fs_devices;
3372         while (cur_devices) {
3373                 if (!fsid ||
3374                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3375                         device = __find_device(&cur_devices->devices,
3376                                                devid, uuid);
3377                         if (device)
3378                                 return device;
3379                 }
3380                 cur_devices = cur_devices->seed;
3381         }
3382         return NULL;
3383 }
3384
3385 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3386                                             u64 devid, u8 *dev_uuid)
3387 {
3388         struct btrfs_device *device;
3389         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
3390
3391         device = kzalloc(sizeof(*device), GFP_NOFS);
3392         if (!device)
3393                 return NULL;
3394         list_add(&device->dev_list,
3395                  &fs_devices->devices);
3396         device->barriers = 1;
3397         device->dev_root = root->fs_info->dev_root;
3398         device->devid = devid;
3399         device->work.func = pending_bios_fn;
3400         device->fs_devices = fs_devices;
3401         device->missing = 1;
3402         fs_devices->num_devices++;
3403         fs_devices->missing_devices++;
3404         spin_lock_init(&device->io_lock);
3405         INIT_LIST_HEAD(&device->dev_alloc_list);
3406         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
3407         return device;
3408 }
3409
3410 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
3411                           struct extent_buffer *leaf,
3412                           struct btrfs_chunk *chunk)
3413 {
3414         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3415         struct map_lookup *map;
3416         struct extent_map *em;
3417         u64 logical;
3418         u64 length;
3419         u64 devid;
3420         u8 uuid[BTRFS_UUID_SIZE];
3421         int num_stripes;
3422         int ret;
3423         int i;
3424
3425         logical = key->offset;
3426         length = btrfs_chunk_length(leaf, chunk);
3427
3428         read_lock(&map_tree->map_tree.lock);
3429         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
3430         read_unlock(&map_tree->map_tree.lock);
3431
3432         /* already mapped? */
3433         if (em && em->start <= logical && em->start + em->len > logical) {
3434                 free_extent_map(em);
3435                 return 0;
3436         } else if (em) {
3437                 free_extent_map(em);
3438         }
3439
3440         em = alloc_extent_map(GFP_NOFS);
3441         if (!em)
3442                 return -ENOMEM;
3443         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3444         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3445         if (!map) {
3446                 free_extent_map(em);
3447                 return -ENOMEM;
3448         }
3449
3450         em->bdev = (struct block_device *)map;
3451         em->start = logical;
3452         em->len = length;
3453         em->block_start = 0;
3454         em->block_len = em->len;
3455
3456         map->num_stripes = num_stripes;
3457         map->io_width = btrfs_chunk_io_width(leaf, chunk);
3458         map->io_align = btrfs_chunk_io_align(leaf, chunk);
3459         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
3460         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
3461         map->type = btrfs_chunk_type(leaf, chunk);
3462         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
3463         for (i = 0; i < num_stripes; i++) {
3464                 map->stripes[i].physical =
3465                         btrfs_stripe_offset_nr(leaf, chunk, i);
3466                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
3467                 read_extent_buffer(leaf, uuid, (unsigned long)
3468                                    btrfs_stripe_dev_uuid_nr(chunk, i),
3469                                    BTRFS_UUID_SIZE);
3470                 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
3471                                                         NULL);
3472                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
3473                         kfree(map);
3474                         free_extent_map(em);
3475                         return -EIO;
3476                 }
3477                 if (!map->stripes[i].dev) {
3478                         map->stripes[i].dev =
3479                                 add_missing_dev(root, devid, uuid);
3480                         if (!map->stripes[i].dev) {
3481                                 kfree(map);
3482                                 free_extent_map(em);
3483                                 return -EIO;
3484                         }
3485                 }
3486                 map->stripes[i].dev->in_fs_metadata = 1;
3487         }
3488
3489         write_lock(&map_tree->map_tree.lock);
3490         ret = add_extent_mapping(&map_tree->map_tree, em);
3491         write_unlock(&map_tree->map_tree.lock);
3492         BUG_ON(ret);
3493         free_extent_map(em);
3494
3495         return 0;
3496 }
3497
3498 static int fill_device_from_item(struct extent_buffer *leaf,
3499                                  struct btrfs_dev_item *dev_item,
3500                                  struct btrfs_device *device)
3501 {
3502         unsigned long ptr;
3503
3504         device->devid = btrfs_device_id(leaf, dev_item);
3505         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
3506         device->total_bytes = device->disk_total_bytes;
3507         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
3508         device->type = btrfs_device_type(leaf, dev_item);
3509         device->io_align = btrfs_device_io_align(leaf, dev_item);
3510         device->io_width = btrfs_device_io_width(leaf, dev_item);
3511         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
3512
3513         ptr = (unsigned long)btrfs_device_uuid(dev_item);
3514         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
3515
3516         return 0;
3517 }
3518
3519 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3520 {
3521         struct btrfs_fs_devices *fs_devices;
3522         int ret;
3523
3524         mutex_lock(&uuid_mutex);
3525
3526         fs_devices = root->fs_info->fs_devices->seed;
3527         while (fs_devices) {
3528                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3529                         ret = 0;
3530                         goto out;
3531                 }
3532                 fs_devices = fs_devices->seed;
3533         }
3534
3535         fs_devices = find_fsid(fsid);
3536         if (!fs_devices) {
3537                 ret = -ENOENT;
3538                 goto out;
3539         }
3540
3541         fs_devices = clone_fs_devices(fs_devices);
3542         if (IS_ERR(fs_devices)) {
3543                 ret = PTR_ERR(fs_devices);
3544                 goto out;
3545         }
3546
3547         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
3548                                    root->fs_info->bdev_holder);
3549         if (ret)
3550                 goto out;
3551
3552         if (!fs_devices->seeding) {
3553                 __btrfs_close_devices(fs_devices);
3554                 free_fs_devices(fs_devices);
3555                 ret = -EINVAL;
3556                 goto out;
3557         }
3558
3559         fs_devices->seed = root->fs_info->fs_devices->seed;
3560         root->fs_info->fs_devices->seed = fs_devices;
3561 out:
3562         mutex_unlock(&uuid_mutex);
3563         return ret;
3564 }
3565
3566 static int read_one_dev(struct btrfs_root *root,
3567                         struct extent_buffer *leaf,
3568                         struct btrfs_dev_item *dev_item)
3569 {
3570         struct btrfs_device *device;
3571         u64 devid;
3572         int ret;
3573         u8 fs_uuid[BTRFS_UUID_SIZE];
3574         u8 dev_uuid[BTRFS_UUID_SIZE];
3575
3576         devid = btrfs_device_id(leaf, dev_item);
3577         read_extent_buffer(leaf, dev_uuid,
3578                            (unsigned long)btrfs_device_uuid(dev_item),
3579                            BTRFS_UUID_SIZE);
3580         read_extent_buffer(leaf, fs_uuid,
3581                            (unsigned long)btrfs_device_fsid(dev_item),
3582                            BTRFS_UUID_SIZE);
3583
3584         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
3585                 ret = open_seed_devices(root, fs_uuid);
3586                 if (ret && !btrfs_test_opt(root, DEGRADED))
3587                         return ret;
3588         }
3589
3590         device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
3591         if (!device || !device->bdev) {
3592                 if (!btrfs_test_opt(root, DEGRADED))
3593                         return -EIO;
3594
3595                 if (!device) {
3596                         printk(KERN_WARNING "warning devid %llu missing\n",
3597                                (unsigned long long)devid);
3598                         device = add_missing_dev(root, devid, dev_uuid);
3599                         if (!device)
3600                                 return -ENOMEM;
3601                 } else if (!device->missing) {
3602                         /*
3603                          * this happens when a device that was properly setup
3604                          * in the device info lists suddenly goes bad.
3605                          * device->bdev is NULL, and so we have to set
3606                          * device->missing to one here
3607                          */
3608                         root->fs_info->fs_devices->missing_devices++;
3609                         device->missing = 1;
3610                 }
3611         }
3612
3613         if (device->fs_devices != root->fs_info->fs_devices) {
3614                 BUG_ON(device->writeable);
3615                 if (device->generation !=
3616                     btrfs_device_generation(leaf, dev_item))
3617                         return -EINVAL;
3618         }
3619
3620         fill_device_from_item(leaf, dev_item, device);
3621         device->dev_root = root->fs_info->dev_root;
3622         device->in_fs_metadata = 1;
3623         if (device->writeable)
3624                 device->fs_devices->total_rw_bytes += device->total_bytes;
3625         ret = 0;
3626         return ret;
3627 }
3628
3629 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
3630 {
3631         struct btrfs_dev_item *dev_item;
3632
3633         dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3634                                                      dev_item);
3635         return read_one_dev(root, buf, dev_item);
3636 }
3637
3638 int btrfs_read_sys_array(struct btrfs_root *root)
3639 {
3640         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3641         struct extent_buffer *sb;
3642         struct btrfs_disk_key *disk_key;
3643         struct btrfs_chunk *chunk;
3644         u8 *ptr;
3645         unsigned long sb_ptr;
3646         int ret = 0;
3647         u32 num_stripes;
3648         u32 array_size;
3649         u32 len = 0;
3650         u32 cur;
3651         struct btrfs_key key;
3652
3653         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3654                                           BTRFS_SUPER_INFO_SIZE);
3655         if (!sb)
3656                 return -ENOMEM;
3657         btrfs_set_buffer_uptodate(sb);
3658         btrfs_set_buffer_lockdep_class(sb, 0);
3659
3660         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3661         array_size = btrfs_super_sys_array_size(super_copy);
3662
3663         ptr = super_copy->sys_chunk_array;
3664         sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3665         cur = 0;
3666
3667         while (cur < array_size) {
3668                 disk_key = (struct btrfs_disk_key *)ptr;
3669                 btrfs_disk_key_to_cpu(&key, disk_key);
3670
3671                 len = sizeof(*disk_key); ptr += len;
3672                 sb_ptr += len;
3673                 cur += len;
3674
3675                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3676                         chunk = (struct btrfs_chunk *)sb_ptr;
3677                         ret = read_one_chunk(root, &key, sb, chunk);
3678                         if (ret)
3679                                 break;
3680                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3681                         len = btrfs_chunk_item_size(num_stripes);
3682                 } else {
3683                         ret = -EIO;
3684                         break;
3685                 }
3686                 ptr += len;
3687                 sb_ptr += len;
3688                 cur += len;
3689         }
3690         free_extent_buffer(sb);
3691         return ret;
3692 }
3693
3694 int btrfs_read_chunk_tree(struct btrfs_root *root)
3695 {
3696         struct btrfs_path *path;
3697         struct extent_buffer *leaf;
3698         struct btrfs_key key;
3699         struct btrfs_key found_key;
3700         int ret;
3701         int slot;
3702
3703         root = root->fs_info->chunk_root;
3704
3705         path = btrfs_alloc_path();
3706         if (!path)
3707                 return -ENOMEM;
3708
3709         /* first we search for all of the device items, and then we
3710          * read in all of the chunk items.  This way we can create chunk
3711          * mappings that reference all of the devices that are afound
3712          */
3713         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3714         key.offset = 0;
3715         key.type = 0;
3716 again:
3717         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3718         if (ret < 0)
3719                 goto error;
3720         while (1) {
3721                 leaf = path->nodes[0];
3722                 slot = path->slots[0];
3723                 if (slot >= btrfs_header_nritems(leaf)) {
3724                         ret = btrfs_next_leaf(root, path);
3725                         if (ret == 0)
3726                                 continue;
3727                         if (ret < 0)
3728                                 goto error;
3729                         break;
3730                 }
3731                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3732                 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3733                         if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3734                                 break;
3735                         if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3736                                 struct btrfs_dev_item *dev_item;
3737                                 dev_item = btrfs_item_ptr(leaf, slot,
3738                                                   struct btrfs_dev_item);
3739                                 ret = read_one_dev(root, leaf, dev_item);
3740                                 if (ret)
3741                                         goto error;
3742                         }
3743                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3744                         struct btrfs_chunk *chunk;
3745                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3746                         ret = read_one_chunk(root, &found_key, leaf, chunk);
3747                         if (ret)
3748                                 goto error;
3749                 }
3750                 path->slots[0]++;
3751         }
3752         if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3753                 key.objectid = 0;
3754                 btrfs_release_path(root, path);
3755                 goto again;
3756         }
3757         ret = 0;
3758 error:
3759         btrfs_free_path(path);
3760         return ret;
3761 }