fs/btrfs/extent-tree.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 02111-1307, USA.
  17  */
  18 #include <linux/sched.h>
  19 #include <linux/pagemap.h>
  20 #include <linux/writeback.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/sort.h>
  23 #include <linux/rcupdate.h>
  24 #include <linux/kthread.h>
  25 #include <linux/slab.h>
  26 #include <linux/ratelimit.h>
  27 #include <linux/percpu_counter.h>
  28 #include "hash.h"
  29 #include "tree-log.h"
  30 #include "disk-io.h"
  31 #include "print-tree.h"
  32 #include "volumes.h"
  33 #include "raid56.h"
  34 #include "locking.h"
  35 #include "free-space-cache.h"
  36 #include "free-space-tree.h"
  37 #include "math.h"
  38 #include "sysfs.h"
  39 #include "qgroup.h"
  40
  41 #undef SCRAMBLE_DELAYED_REFS
  42
  43 /*
  44  * control flags for do_chunk_alloc's force field
  45  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  46  * if we really need one.
  47  *
  48  * CHUNK_ALLOC_LIMITED means to only try and allocate one
  49  * if we have very few chunks already allocated.  This is
  50  * used as part of the clustering code to help make sure
  51  * we have a good pool of storage to cluster in, without
  52  * filling the FS with empty chunks
  53  *
  54  * CHUNK_ALLOC_FORCE means it must try to allocate one
  55  *
  56  */
  57 enum {
  58         CHUNK_ALLOC_NO_FORCE = 0,
  59         CHUNK_ALLOC_LIMITED = 1,
  60         CHUNK_ALLOC_FORCE = 2,
  61 };
  62
  63 static int update_block_group(struct btrfs_trans_handle *trans,
  64                               struct btrfs_root *root, u64 bytenr,
  65                               u64 num_bytes, int alloc);
  66 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  67                                 struct btrfs_root *root,
  68                                 struct btrfs_delayed_ref_node *node, u64 parent,
  69                                 u64 root_objectid, u64 owner_objectid,
  70                                 u64 owner_offset, int refs_to_drop,
  71                                 struct btrfs_delayed_extent_op *extra_op);
  72 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  73                                     struct extent_buffer *leaf,
  74                                     struct btrfs_extent_item *ei);
  75 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  76                                       struct btrfs_root *root,
  77                                       u64 parent, u64 root_objectid,
  78                                       u64 flags, u64 owner, u64 offset,
  79                                       struct btrfs_key *ins, int ref_mod);
  80 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  81                                      struct btrfs_root *root,
  82                                      u64 parent, u64 root_objectid,
  83                                      u64 flags, struct btrfs_disk_key *key,
  84                                      int level, struct btrfs_key *ins);
  85 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
  86                           struct btrfs_root *extent_root, u64 flags,
  87                           int force);
  88 static int find_next_key(struct btrfs_path *path, int level,
  89                          struct btrfs_key *key);
  90 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
  91                             int dump_block_groups);
  92 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
  93                                     u64 ram_bytes, u64 num_bytes, int delalloc);
  94 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
  95                                      u64 num_bytes, int delalloc);
  96 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
  97                                u64 num_bytes);
  98 int btrfs_pin_extent(struct btrfs_root *root,
  99                      u64 bytenr, u64 num_bytes, int reserved);
 100 static int __reserve_metadata_bytes(struct btrfs_root *root,
 101                                     struct btrfs_space_info *space_info,
 102                                     u64 orig_bytes,
 103                                     enum btrfs_reserve_flush_enum flush);
 104 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
 105                                      struct btrfs_space_info *space_info,
 106                                      u64 num_bytes);
 107 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
 108                                      struct btrfs_space_info *space_info,
 109                                      u64 num_bytes);
 110
 111 static noinline int
 112 block_group_cache_done(struct btrfs_block_group_cache *cache)
 113 {
 114         smp_mb();
 115         return cache->cached == BTRFS_CACHE_FINISHED ||
 116                 cache->cached == BTRFS_CACHE_ERROR;
 117 }
 118
 119 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 120 {
 121         return (cache->flags & bits) == bits;
 122 }
 123
 124 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 125 {
 126         atomic_inc(&cache->count);
 127 }
 128
 129 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 130 {
 131         if (atomic_dec_and_test(&cache->count)) {
 132                 WARN_ON(cache->pinned > 0);
 133                 WARN_ON(cache->reserved > 0);
 134                 kfree(cache->free_space_ctl);
 135                 kfree(cache);
 136         }
 137 }
 138
 139 /*
 140  * this adds the block group to the fs_info rb tree for the block group
 141  * cache
 142  */
 143 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 144                                 struct btrfs_block_group_cache *block_group)
 145 {
 146         struct rb_node **p;
 147         struct rb_node *parent = NULL;
 148         struct btrfs_block_group_cache *cache;
 149
 150         spin_lock(&info->block_group_cache_lock);
 151         p = &info->block_group_cache_tree.rb_node;
 152
 153         while (*p) {
 154                 parent = *p;
 155                 cache = rb_entry(parent, struct btrfs_block_group_cache,
 156                                  cache_node);
 157                 if (block_group->key.objectid < cache->key.objectid) {
 158                         p = &(*p)->rb_left;
 159                 } else if (block_group->key.objectid > cache->key.objectid) {
 160                         p = &(*p)->rb_right;
 161                 } else {
 162                         spin_unlock(&info->block_group_cache_lock);
 163                         return -EEXIST;
 164                 }
 165         }
 166
 167         rb_link_node(&block_group->cache_node, parent, p);
 168         rb_insert_color(&block_group->cache_node,
 169                         &info->block_group_cache_tree);
 170
 171         if (info->first_logical_byte > block_group->key.objectid)
 172                 info->first_logical_byte = block_group->key.objectid;
 173
 174         spin_unlock(&info->block_group_cache_lock);
 175
 176         return 0;
 177 }
 178
 179 /*
 180  * This will return the block group at or after bytenr if contains is 0, else
 181  * it will return the block group that contains the bytenr
 182  */
 183 static struct btrfs_block_group_cache *
 184 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 185                               int contains)
 186 {
 187         struct btrfs_block_group_cache *cache, *ret = NULL;
 188         struct rb_node *n;
 189         u64 end, start;
 190
 191         spin_lock(&info->block_group_cache_lock);
 192         n = info->block_group_cache_tree.rb_node;
 193
 194         while (n) {
 195                 cache = rb_entry(n, struct btrfs_block_group_cache,
 196                                  cache_node);
 197                 end = cache->key.objectid + cache->key.offset - 1;
 198                 start = cache->key.objectid;
 199
 200                 if (bytenr < start) {
 201                         if (!contains && (!ret || start < ret->key.objectid))
 202                                 ret = cache;
 203                         n = n->rb_left;
 204                 } else if (bytenr > start) {
 205                         if (contains && bytenr <= end) {
 206                                 ret = cache;
 207                                 break;
 208                         }
 209                         n = n->rb_right;
 210                 } else {
 211                         ret = cache;
 212                         break;
 213                 }
 214         }
 215         if (ret) {
 216                 btrfs_get_block_group(ret);
 217                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
 218                         info->first_logical_byte = ret->key.objectid;
 219         }
 220         spin_unlock(&info->block_group_cache_lock);
 221
 222         return ret;
 223 }
 224
 225 static int add_excluded_extent(struct btrfs_root *root,
 226                                u64 start, u64 num_bytes)
 227 {
 228         u64 end = start + num_bytes - 1;
 229         set_extent_bits(&root->fs_info->freed_extents[0],
 230                         start, end, EXTENT_UPTODATE);
 231         set_extent_bits(&root->fs_info->freed_extents[1],
 232                         start, end, EXTENT_UPTODATE);
 233         return 0;
 234 }
 235
 236 static void free_excluded_extents(struct btrfs_root *root,
 237                                   struct btrfs_block_group_cache *cache)
 238 {
 239         u64 start, end;
 240
 241         start = cache->key.objectid;
 242         end = start + cache->key.offset - 1;
 243
 244         clear_extent_bits(&root->fs_info->freed_extents[0],
 245                           start, end, EXTENT_UPTODATE);
 246         clear_extent_bits(&root->fs_info->freed_extents[1],
 247                           start, end, EXTENT_UPTODATE);
 248 }
 249
 250 static int exclude_super_stripes(struct btrfs_root *root,
 251                                  struct btrfs_block_group_cache *cache)
 252 {
 253         u64 bytenr;
 254         u64 *logical;
 255         int stripe_len;
 256         int i, nr, ret;
 257
 258         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 259                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 260                 cache->bytes_super += stripe_len;
 261                 ret = add_excluded_extent(root, cache->key.objectid,
 262                                           stripe_len);
 263                 if (ret)
 264                         return ret;
 265         }
 266
 267         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 268                 bytenr = btrfs_sb_offset(i);
 269                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
 270                                        cache->key.objectid, bytenr,
 271                                        0, &logical, &nr, &stripe_len);
 272                 if (ret)
 273                         return ret;
 274
 275                 while (nr--) {
 276                         u64 start, len;
 277
 278                         if (logical[nr] > cache->key.objectid +
 279                             cache->key.offset)
 280                                 continue;
 281
 282                         if (logical[nr] + stripe_len <= cache->key.objectid)
 283                                 continue;
 284
 285                         start = logical[nr];
 286                         if (start < cache->key.objectid) {
 287                                 start = cache->key.objectid;
 288                                 len = (logical[nr] + stripe_len) - start;
 289                         } else {
 290                                 len = min_t(u64, stripe_len,
 291                                             cache->key.objectid +
 292                                             cache->key.offset - start);
 293                         }
 294
 295                         cache->bytes_super += len;
 296                         ret = add_excluded_extent(root, start, len);
 297                         if (ret) {
 298                                 kfree(logical);
 299                                 return ret;
 300                         }
 301                 }
 302
 303                 kfree(logical);
 304         }
 305         return 0;
 306 }
 307
 308 static struct btrfs_caching_control *
 309 get_caching_control(struct btrfs_block_group_cache *cache)
 310 {
 311         struct btrfs_caching_control *ctl;
 312
 313         spin_lock(&cache->lock);
 314         if (!cache->caching_ctl) {
 315                 spin_unlock(&cache->lock);
 316                 return NULL;
 317         }
 318
 319         ctl = cache->caching_ctl;
 320         atomic_inc(&ctl->count);
 321         spin_unlock(&cache->lock);
 322         return ctl;
 323 }
 324
 325 static void put_caching_control(struct btrfs_caching_control *ctl)
 326 {
 327         if (atomic_dec_and_test(&ctl->count))
 328                 kfree(ctl);
 329 }
 330
 331 #ifdef CONFIG_BTRFS_DEBUG
 332 static void fragment_free_space(struct btrfs_root *root,
 333                                 struct btrfs_block_group_cache *block_group)
 334 {
 335         u64 start = block_group->key.objectid;
 336         u64 len = block_group->key.offset;
 337         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 338                 root->nodesize : root->sectorsize;
 339         u64 step = chunk << 1;
 340
 341         while (len > chunk) {
 342                 btrfs_remove_free_space(block_group, start, chunk);
 343                 start += step;
 344                 if (len < step)
 345                         len = 0;
 346                 else
 347                         len -= step;
 348         }
 349 }
 350 #endif
 351
 352 /*
 353  * this is only called by cache_block_group, since we could have freed extents
 354  * we need to check the pinned_extents for any extents that can't be used yet
 355  * since their free space will be released as soon as the transaction commits.
 356  */
 357 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 358                        struct btrfs_fs_info *info, u64 start, u64 end)
 359 {
 360         u64 extent_start, extent_end, size, total_added = 0;
 361         int ret;
 362
 363         while (start < end) {
 364                 ret = find_first_extent_bit(info->pinned_extents, start,
 365                                             &extent_start, &extent_end,
 366                                             EXTENT_DIRTY | EXTENT_UPTODATE,
 367                                             NULL);
 368                 if (ret)
 369                         break;
 370
 371                 if (extent_start <= start) {
 372                         start = extent_end + 1;
 373                 } else if (extent_start > start && extent_start < end) {
 374                         size = extent_start - start;
 375                         total_added += size;
 376                         ret = btrfs_add_free_space(block_group, start,
 377                                                    size);
 378                         BUG_ON(ret); /* -ENOMEM or logic error */
 379                         start = extent_end + 1;
 380                 } else {
 381                         break;
 382                 }
 383         }
 384
 385         if (start < end) {
 386                 size = end - start;
 387                 total_added += size;
 388                 ret = btrfs_add_free_space(block_group, start, size);
 389                 BUG_ON(ret); /* -ENOMEM or logic error */
 390         }
 391
 392         return total_added;
 393 }
 394
 395 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 396 {
 397         struct btrfs_block_group_cache *block_group;
 398         struct btrfs_fs_info *fs_info;
 399         struct btrfs_root *extent_root;
 400         struct btrfs_path *path;
 401         struct extent_buffer *leaf;
 402         struct btrfs_key key;
 403         u64 total_found = 0;
 404         u64 last = 0;
 405         u32 nritems;
 406         int ret;
 407         bool wakeup = true;
 408
 409         block_group = caching_ctl->block_group;
 410         fs_info = block_group->fs_info;
 411         extent_root = fs_info->extent_root;
 412
 413         path = btrfs_alloc_path();
 414         if (!path)
 415                 return -ENOMEM;
 416
 417         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 418
 419 #ifdef CONFIG_BTRFS_DEBUG
 420         /*
 421          * If we're fragmenting we don't want to make anybody think we can
 422          * allocate from this block group until we've had a chance to fragment
 423          * the free space.
 424          */
 425         if (btrfs_should_fragment_free_space(extent_root, block_group))
 426                 wakeup = false;
 427 #endif
 428         /*
 429          * We don't want to deadlock with somebody trying to allocate a new
 430          * extent for the extent root while also trying to search the extent
 431          * root to add free space.  So we skip locking and search the commit
 432          * root, since its read-only
 433          */
 434         path->skip_locking = 1;
 435         path->search_commit_root = 1;
 436         path->reada = READA_FORWARD;
 437
 438         key.objectid = last;
 439         key.offset = 0;
 440         key.type = BTRFS_EXTENT_ITEM_KEY;
 441
 442 next:
 443         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 444         if (ret < 0)
 445                 goto out;
 446
 447         leaf = path->nodes[0];
 448         nritems = btrfs_header_nritems(leaf);
 449
 450         while (1) {
 451                 if (btrfs_fs_closing(fs_info) > 1) {
 452                         last = (u64)-1;
 453                         break;
 454                 }
 455
 456                 if (path->slots[0] < nritems) {
 457                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 458                 } else {
 459                         ret = find_next_key(path, 0, &key);
 460                         if (ret)
 461                                 break;
 462
 463                         if (need_resched() ||
 464                             rwsem_is_contended(&fs_info->commit_root_sem)) {
 465                                 if (wakeup)
 466                                         caching_ctl->progress = last;
 467                                 btrfs_release_path(path);
 468                                 up_read(&fs_info->commit_root_sem);
 469                                 mutex_unlock(&caching_ctl->mutex);
 470                                 cond_resched();
 471                                 mutex_lock(&caching_ctl->mutex);
 472                                 down_read(&fs_info->commit_root_sem);
 473                                 goto next;
 474                         }
 475
 476                         ret = btrfs_next_leaf(extent_root, path);
 477                         if (ret < 0)
 478                                 goto out;
 479                         if (ret)
 480                                 break;
 481                         leaf = path->nodes[0];
 482                         nritems = btrfs_header_nritems(leaf);
 483                         continue;
 484                 }
 485
 486                 if (key.objectid < last) {
 487                         key.objectid = last;
 488                         key.offset = 0;
 489                         key.type = BTRFS_EXTENT_ITEM_KEY;
 490
 491                         if (wakeup)
 492                                 caching_ctl->progress = last;
 493                         btrfs_release_path(path);
 494                         goto next;
 495                 }
 496
 497                 if (key.objectid < block_group->key.objectid) {
 498                         path->slots[0]++;
 499                         continue;
 500                 }
 501
 502                 if (key.objectid >= block_group->key.objectid +
 503                     block_group->key.offset)
 504                         break;
 505
 506                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 507                     key.type == BTRFS_METADATA_ITEM_KEY) {
 508                         total_found += add_new_free_space(block_group,
 509                                                           fs_info, last,
 510                                                           key.objectid);
 511                         if (key.type == BTRFS_METADATA_ITEM_KEY)
 512                                 last = key.objectid +
 513                                         fs_info->tree_root->nodesize;
 514                         else
 515                                 last = key.objectid + key.offset;
 516
 517                         if (total_found > CACHING_CTL_WAKE_UP) {
 518                                 total_found = 0;
 519                                 if (wakeup)
 520                                         wake_up(&caching_ctl->wait);
 521                         }
 522                 }
 523                 path->slots[0]++;
 524         }
 525         ret = 0;
 526
 527         total_found += add_new_free_space(block_group, fs_info, last,
 528                                           block_group->key.objectid +
 529                                           block_group->key.offset);
 530         caching_ctl->progress = (u64)-1;
 531
 532 out:
 533         btrfs_free_path(path);
 534         return ret;
 535 }
 536
 537 static noinline void caching_thread(struct btrfs_work *work)
 538 {
 539         struct btrfs_block_group_cache *block_group;
 540         struct btrfs_fs_info *fs_info;
 541         struct btrfs_caching_control *caching_ctl;
 542         struct btrfs_root *extent_root;
 543         int ret;
 544
 545         caching_ctl = container_of(work, struct btrfs_caching_control, work);
 546         block_group = caching_ctl->block_group;
 547         fs_info = block_group->fs_info;
 548         extent_root = fs_info->extent_root;
 549
 550         mutex_lock(&caching_ctl->mutex);
 551         down_read(&fs_info->commit_root_sem);
 552
 553         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 554                 ret = load_free_space_tree(caching_ctl);
 555         else
 556                 ret = load_extent_tree_free(caching_ctl);
 557
 558         spin_lock(&block_group->lock);
 559         block_group->caching_ctl = NULL;
 560         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 561         spin_unlock(&block_group->lock);
 562
 563 #ifdef CONFIG_BTRFS_DEBUG
 564         if (btrfs_should_fragment_free_space(extent_root, block_group)) {
 565                 u64 bytes_used;
 566
 567                 spin_lock(&block_group->space_info->lock);
 568                 spin_lock(&block_group->lock);
 569                 bytes_used = block_group->key.offset -
 570                         btrfs_block_group_used(&block_group->item);
 571                 block_group->space_info->bytes_used += bytes_used >> 1;
 572                 spin_unlock(&block_group->lock);
 573                 spin_unlock(&block_group->space_info->lock);
 574                 fragment_free_space(extent_root, block_group);
 575         }
 576 #endif
 577
 578         caching_ctl->progress = (u64)-1;
 579
 580         up_read(&fs_info->commit_root_sem);
 581         free_excluded_extents(fs_info->extent_root, block_group);
 582         mutex_unlock(&caching_ctl->mutex);
 583
 584         wake_up(&caching_ctl->wait);
 585
 586         put_caching_control(caching_ctl);
 587         btrfs_put_block_group(block_group);
 588 }
 589
 590 static int cache_block_group(struct btrfs_block_group_cache *cache,
 591                              int load_cache_only)
 592 {
 593         DEFINE_WAIT(wait);
 594         struct btrfs_fs_info *fs_info = cache->fs_info;
 595         struct btrfs_caching_control *caching_ctl;
 596         int ret = 0;
 597
 598         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 599         if (!caching_ctl)
 600                 return -ENOMEM;
 601
 602         INIT_LIST_HEAD(&caching_ctl->list);
 603         mutex_init(&caching_ctl->mutex);
 604         init_waitqueue_head(&caching_ctl->wait);
 605         caching_ctl->block_group = cache;
 606         caching_ctl->progress = cache->key.objectid;
 607         atomic_set(&caching_ctl->count, 1);
 608         btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
 609                         caching_thread, NULL, NULL);
 610
 611         spin_lock(&cache->lock);
 612         /*
 613          * This should be a rare occasion, but this could happen I think in the
 614          * case where one thread starts to load the space cache info, and then
 615          * some other thread starts a transaction commit which tries to do an
 616          * allocation while the other thread is still loading the space cache
 617          * info.  The previous loop should have kept us from choosing this block
 618          * group, but if we've moved to the state where we will wait on caching
 619          * block groups we need to first check if we're doing a fast load here,
 620          * so we can wait for it to finish, otherwise we could end up allocating
 621          * from a block group who's cache gets evicted for one reason or
 622          * another.
 623          */
 624         while (cache->cached == BTRFS_CACHE_FAST) {
 625                 struct btrfs_caching_control *ctl;
 626
 627                 ctl = cache->caching_ctl;
 628                 atomic_inc(&ctl->count);
 629                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 630                 spin_unlock(&cache->lock);
 631
 632                 schedule();
 633
 634                 finish_wait(&ctl->wait, &wait);
 635                 put_caching_control(ctl);
 636                 spin_lock(&cache->lock);
 637         }
 638
 639         if (cache->cached != BTRFS_CACHE_NO) {
 640                 spin_unlock(&cache->lock);
 641                 kfree(caching_ctl);
 642                 return 0;
 643         }
 644         WARN_ON(cache->caching_ctl);
 645         cache->caching_ctl = caching_ctl;
 646         cache->cached = BTRFS_CACHE_FAST;
 647         spin_unlock(&cache->lock);
 648
 649         if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
 650                 mutex_lock(&caching_ctl->mutex);
 651                 ret = load_free_space_cache(fs_info, cache);
 652
 653                 spin_lock(&cache->lock);
 654                 if (ret == 1) {
 655                         cache->caching_ctl = NULL;
 656                         cache->cached = BTRFS_CACHE_FINISHED;
 657                         cache->last_byte_to_unpin = (u64)-1;
 658                         caching_ctl->progress = (u64)-1;
 659                 } else {
 660                         if (load_cache_only) {
 661                                 cache->caching_ctl = NULL;
 662                                 cache->cached = BTRFS_CACHE_NO;
 663                         } else {
 664                                 cache->cached = BTRFS_CACHE_STARTED;
 665                                 cache->has_caching_ctl = 1;
 666                         }
 667                 }
 668                 spin_unlock(&cache->lock);
 669 #ifdef CONFIG_BTRFS_DEBUG
 670                 if (ret == 1 &&
 671                     btrfs_should_fragment_free_space(fs_info->extent_root,
 672                                                      cache)) {
 673                         u64 bytes_used;
 674
 675                         spin_lock(&cache->space_info->lock);
 676                         spin_lock(&cache->lock);
 677                         bytes_used = cache->key.offset -
 678                                 btrfs_block_group_used(&cache->item);
 679                         cache->space_info->bytes_used += bytes_used >> 1;
 680                         spin_unlock(&cache->lock);
 681                         spin_unlock(&cache->space_info->lock);
 682                         fragment_free_space(fs_info->extent_root, cache);
 683                 }
 684 #endif
 685                 mutex_unlock(&caching_ctl->mutex);
 686
 687                 wake_up(&caching_ctl->wait);
 688                 if (ret == 1) {
 689                         put_caching_control(caching_ctl);
 690                         free_excluded_extents(fs_info->extent_root, cache);
 691                         return 0;
 692                 }
 693         } else {
 694                 /*
 695                  * We're either using the free space tree or no caching at all.
 696                  * Set cached to the appropriate value and wakeup any waiters.
 697                  */
 698                 spin_lock(&cache->lock);
 699                 if (load_cache_only) {
 700                         cache->caching_ctl = NULL;
 701                         cache->cached = BTRFS_CACHE_NO;
 702                 } else {
 703                         cache->cached = BTRFS_CACHE_STARTED;
 704                         cache->has_caching_ctl = 1;
 705                 }
 706                 spin_unlock(&cache->lock);
 707                 wake_up(&caching_ctl->wait);
 708         }
 709
 710         if (load_cache_only) {
 711                 put_caching_control(caching_ctl);
 712                 return 0;
 713         }
 714
 715         down_write(&fs_info->commit_root_sem);
 716         atomic_inc(&caching_ctl->count);
 717         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 718         up_write(&fs_info->commit_root_sem);
 719
 720         btrfs_get_block_group(cache);
 721
 722         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 723
 724         return ret;
 725 }
 726
 727 /*
 728  * return the block group that starts at or after bytenr
 729  */
 730 static struct btrfs_block_group_cache *
 731 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 732 {
 733         struct btrfs_block_group_cache *cache;
 734
 735         cache = block_group_cache_tree_search(info, bytenr, 0);
 736
 737         return cache;
 738 }
 739
 740 /*
 741  * return the block group that contains the given bytenr
 742  */
 743 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 744                                                  struct btrfs_fs_info *info,
 745                                                  u64 bytenr)
 746 {
 747         struct btrfs_block_group_cache *cache;
 748
 749         cache = block_group_cache_tree_search(info, bytenr, 1);
 750
 751         return cache;
 752 }
 753
 754 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 755                                                   u64 flags)
 756 {
 757         struct list_head *head = &info->space_info;
 758         struct btrfs_space_info *found;
 759
 760         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 761
 762         rcu_read_lock();
 763         list_for_each_entry_rcu(found, head, list) {
 764                 if (found->flags & flags) {
 765                         rcu_read_unlock();
 766                         return found;
 767                 }
 768         }
 769         rcu_read_unlock();
 770         return NULL;
 771 }
 772
 773 /*
 774  * after adding space to the filesystem, we need to clear the full flags
 775  * on all the space infos.
 776  */
 777 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 778 {
 779         struct list_head *head = &info->space_info;
 780         struct btrfs_space_info *found;
 781
 782         rcu_read_lock();
 783         list_for_each_entry_rcu(found, head, list)
 784                 found->full = 0;
 785         rcu_read_unlock();
 786 }
 787
 788 /* simple helper to search for an existing data extent at a given offset */
 789 int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
 790 {
 791         int ret;
 792         struct btrfs_key key;
 793         struct btrfs_path *path;
 794
 795         path = btrfs_alloc_path();
 796         if (!path)
 797                 return -ENOMEM;
 798
 799         key.objectid = start;
 800         key.offset = len;
 801         key.type = BTRFS_EXTENT_ITEM_KEY;
 802         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 803                                 0, 0);
 804         btrfs_free_path(path);
 805         return ret;
 806 }
 807
 808 /*
 809  * helper function to lookup reference count and flags of a tree block.
 810  *
 811  * the head node for delayed ref is used to store the sum of all the
 812  * reference count modifications queued up in the rbtree. the head
 813  * node may also store the extent flags to set. This way you can check
 814  * to see what the reference count and extent flags would be if all of
 815  * the delayed refs are not processed.
 816  */
 817 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 818                              struct btrfs_root *root, u64 bytenr,
 819                              u64 offset, int metadata, u64 *refs, u64 *flags)
 820 {
 821         struct btrfs_delayed_ref_head *head;
 822         struct btrfs_delayed_ref_root *delayed_refs;
 823         struct btrfs_path *path;
 824         struct btrfs_extent_item *ei;
 825         struct extent_buffer *leaf;
 826         struct btrfs_key key;
 827         u32 item_size;
 828         u64 num_refs;
 829         u64 extent_flags;
 830         int ret;
 831
 832         /*
 833          * If we don't have skinny metadata, don't bother doing anything
 834          * different
 835          */
 836         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
 837                 offset = root->nodesize;
 838                 metadata = 0;
 839         }
 840
 841         path = btrfs_alloc_path();
 842         if (!path)
 843                 return -ENOMEM;
 844
 845         if (!trans) {
 846                 path->skip_locking = 1;
 847                 path->search_commit_root = 1;
 848         }
 849
 850 search_again:
 851         key.objectid = bytenr;
 852         key.offset = offset;
 853         if (metadata)
 854                 key.type = BTRFS_METADATA_ITEM_KEY;
 855         else
 856                 key.type = BTRFS_EXTENT_ITEM_KEY;
 857
 858         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 859                                 &key, path, 0, 0);
 860         if (ret < 0)
 861                 goto out_free;
 862
 863         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
 864                 if (path->slots[0]) {
 865                         path->slots[0]--;
 866                         btrfs_item_key_to_cpu(path->nodes[0], &key,
 867                                               path->slots[0]);
 868                         if (key.objectid == bytenr &&
 869                             key.type == BTRFS_EXTENT_ITEM_KEY &&
 870                             key.offset == root->nodesize)
 871                                 ret = 0;
 872                 }
 873         }
 874
 875         if (ret == 0) {
 876                 leaf = path->nodes[0];
 877                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 878                 if (item_size >= sizeof(*ei)) {
 879                         ei = btrfs_item_ptr(leaf, path->slots[0],
 880                                             struct btrfs_extent_item);
 881                         num_refs = btrfs_extent_refs(leaf, ei);
 882                         extent_flags = btrfs_extent_flags(leaf, ei);
 883                 } else {
 884 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 885                         struct btrfs_extent_item_v0 *ei0;
 886                         BUG_ON(item_size != sizeof(*ei0));
 887                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
 888                                              struct btrfs_extent_item_v0);
 889                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
 890                         /* FIXME: this isn't correct for data */
 891                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 892 #else
 893                         BUG();
 894 #endif
 895                 }
 896                 BUG_ON(num_refs == 0);
 897         } else {
 898                 num_refs = 0;
 899                 extent_flags = 0;
 900                 ret = 0;
 901         }
 902
 903         if (!trans)
 904                 goto out;
 905
 906         delayed_refs = &trans->transaction->delayed_refs;
 907         spin_lock(&delayed_refs->lock);
 908         head = btrfs_find_delayed_ref_head(trans, bytenr);
 909         if (head) {
 910                 if (!mutex_trylock(&head->mutex)) {
 911                         atomic_inc(&head->node.refs);
 912                         spin_unlock(&delayed_refs->lock);
 913
 914                         btrfs_release_path(path);
 915
 916                         /*
 917                          * Mutex was contended, block until it's released and try
 918                          * again
 919                          */
 920                         mutex_lock(&head->mutex);
 921                         mutex_unlock(&head->mutex);
 922                         btrfs_put_delayed_ref(&head->node);
 923                         goto search_again;
 924                 }
 925                 spin_lock(&head->lock);
 926                 if (head->extent_op && head->extent_op->update_flags)
 927                         extent_flags |= head->extent_op->flags_to_set;
 928                 else
 929                         BUG_ON(num_refs == 0);
 930
 931                 num_refs += head->node.ref_mod;
 932                 spin_unlock(&head->lock);
 933                 mutex_unlock(&head->mutex);
 934         }
 935         spin_unlock(&delayed_refs->lock);
 936 out:
 937         WARN_ON(num_refs == 0);
 938         if (refs)
 939                 *refs = num_refs;
 940         if (flags)
 941                 *flags = extent_flags;
 942 out_free:
 943         btrfs_free_path(path);
 944         return ret;
 945 }
 946
 947 /*
 948  * Back reference rules.  Back refs have three main goals:
 949  *
 950  * 1) differentiate between all holders of references to an extent so that
 951  *    when a reference is dropped we can make sure it was a valid reference
 952  *    before freeing the extent.
 953  *
 954  * 2) Provide enough information to quickly find the holders of an extent
 955  *    if we notice a given block is corrupted or bad.
 956  *
 957  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 958  *    maintenance.  This is actually the same as #2, but with a slightly
 959  *    different use case.
 960  *
 961  * There are two kinds of back refs. The implicit back refs is optimized
 962  * for pointers in non-shared tree blocks. For a given pointer in a block,
 963  * back refs of this kind provide information about the block's owner tree
 964  * and the pointer's key. These information allow us to find the block by
 965  * b-tree searching. The full back refs is for pointers in tree blocks not
 966  * referenced by their owner trees. The location of tree block is recorded
 967  * in the back refs. Actually the full back refs is generic, and can be
 968  * used in all cases the implicit back refs is used. The major shortcoming
 969  * of the full back refs is its overhead. Every time a tree block gets
 970  * COWed, we have to update back refs entry for all pointers in it.
 971  *
 972  * For a newly allocated tree block, we use implicit back refs for
 973  * pointers in it. This means most tree related operations only involve
 974  * implicit back refs. For a tree block created in old transaction, the
 975  * only way to drop a reference to it is COW it. So we can detect the
 976  * event that tree block loses its owner tree's reference and do the
 977  * back refs conversion.
 978  *
 979  * When a tree block is COWed through a tree, there are four cases:
 980  *
 981  * The reference count of the block is one and the tree is the block's
 982  * owner tree. Nothing to do in this case.
 983  *
 984  * The reference count of the block is one and the tree is not the
 985  * block's owner tree. In this case, full back refs is used for pointers
 986  * in the block. Remove these full back refs, add implicit back refs for
 987  * every pointers in the new block.
 988  *
 989  * The reference count of the block is greater than one and the tree is
 990  * the block's owner tree. In this case, implicit back refs is used for
 991  * pointers in the block. Add full back refs for every pointers in the
 992  * block, increase lower level extents' reference counts. The original
 993  * implicit back refs are entailed to the new block.
 994  *
 995  * The reference count of the block is greater than one and the tree is
 996  * not the block's owner tree. Add implicit back refs for every pointer in
 997  * the new block, increase lower level extents' reference count.
 998  *
 999  * Back Reference Key composing:
1000  *
1001  * The key objectid corresponds to the first byte in the extent,
1002  * The key type is used to differentiate between types of back refs.
1003  * There are different meanings of the key offset for different types
1004  * of back refs.
1005  *
1006  * File extents can be referenced by:
1007  *
1008  * - multiple snapshots, subvolumes, or different generations in one subvol
1009  * - different files inside a single subvolume
1010  * - different offsets inside a file (bookend extents in file.c)
1011  *
1012  * The extent ref structure for the implicit back refs has fields for:
1013  *
1014  * - Objectid of the subvolume root
1015  * - objectid of the file holding the reference
1016  * - original offset in the file
1017  * - how many bookend extents
1018  *
1019  * The key offset for the implicit back refs is hash of the first
1020  * three fields.
1021  *
1022  * The extent ref structure for the full back refs has field for:
1023  *
1024  * - number of pointers in the tree leaf
1025  *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
1028  *
 * When a file extent is allocated, the implicit back refs are used.
 * The fields are filled in:
1031  *
1032  *     (root_key.objectid, inode objectid, offset in file, 1)
1033  *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
1036  *
1037  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1038  *
1039  * Btree extents can be referenced by:
1040  *
1041  * - Different subvolumes
1042  *
1043  * Both the implicit back refs and the full back refs for tree blocks
1044  * only consist of key. The key offset for the implicit back refs is
1045  * objectid of block's owner tree. The key offset for the full back refs
1046  * is the first byte of parent block.
1047  *
1048  * When implicit back refs is used, information about the lowest key and
1049  * level of the tree block are required. These information are stored in
1050  * tree block info structure.
1051  */
1052
1053 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1054 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
1055                                   struct btrfs_root *root,
1056                                   struct btrfs_path *path,
1057                                   u64 owner, u32 extra_size)
1058 {
1059         struct btrfs_extent_item *item;
1060         struct btrfs_extent_item_v0 *ei0;
1061         struct btrfs_extent_ref_v0 *ref0;
1062         struct btrfs_tree_block_info *bi;
1063         struct extent_buffer *leaf;
1064         struct btrfs_key key;
1065         struct btrfs_key found_key;
1066         u32 new_size = sizeof(*item);
1067         u64 refs;
1068         int ret;
1069
1070         leaf = path->nodes[0];
1071         BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
1072
1073         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1074         ei0 = btrfs_item_ptr(leaf, path->slots[0],
1075                              struct btrfs_extent_item_v0);
1076         refs = btrfs_extent_refs_v0(leaf, ei0);
1077
1078         if (owner == (u64)-1) {
1079                 while (1) {
1080                         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1081                                 ret = btrfs_next_leaf(root, path);
1082                                 if (ret < 0)
1083                                         return ret;
1084                                 BUG_ON(ret > 0); /* Corruption */
1085                                 leaf = path->nodes[0];
1086                         }
1087                         btrfs_item_key_to_cpu(leaf, &found_key,
1088                                               path->slots[0]);
1089                         BUG_ON(key.objectid != found_key.objectid);
1090                         if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1091                                 path->slots[0]++;
1092                                 continue;
1093                         }
1094                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1095                                               struct btrfs_extent_ref_v0);
1096                         owner = btrfs_ref_objectid_v0(leaf, ref0);
1097                         break;
1098                 }
1099         }
1100         btrfs_release_path(path);
1101
1102         if (owner < BTRFS_FIRST_FREE_OBJECTID)
1103                 new_size += sizeof(*bi);
1104
1105         new_size -= sizeof(*ei0);
1106         ret = btrfs_search_slot(trans, root, &key, path,
1107                                 new_size + extra_size, 1);
1108         if (ret < 0)
1109                 return ret;
1110         BUG_ON(ret); /* Corruption */
1111
1112         btrfs_extend_item(root, path, new_size);
1113
1114         leaf = path->nodes[0];
1115         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1116         btrfs_set_extent_refs(leaf, item, refs);
1117         /* FIXME: get real generation */
1118         btrfs_set_extent_generation(leaf, item, 0);
1119         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1120                 btrfs_set_extent_flags(leaf, item,
1121                                        BTRFS_EXTENT_FLAG_TREE_BLOCK |
1122                                        BTRFS_BLOCK_FLAG_FULL_BACKREF);
1123                 bi = (struct btrfs_tree_block_info *)(item + 1);
1124                 /* FIXME: get first key of the block */
1125                 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1126                 btrfs_set_tree_block_level(leaf, bi, (int)owner);
1127         } else {
1128                 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1129         }
1130         btrfs_mark_buffer_dirty(leaf);
1131         return 0;
1132 }
1133 #endif
1134
1135 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1136 {
1137         u32 high_crc = ~(u32)0;
1138         u32 low_crc = ~(u32)0;
1139         __le64 lenum;
1140
1141         lenum = cpu_to_le64(root_objectid);
1142         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1143         lenum = cpu_to_le64(owner);
1144         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1145         lenum = cpu_to_le64(offset);
1146         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1147
1148         return ((u64)high_crc << 31) ^ (u64)low_crc;
1149 }
1150
1151 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1152                                      struct btrfs_extent_data_ref *ref)
1153 {
1154         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1155                                     btrfs_extent_data_ref_objectid(leaf, ref),
1156                                     btrfs_extent_data_ref_offset(leaf, ref));
1157 }
1158
1159 static int match_extent_data_ref(struct extent_buffer *leaf,
1160                                  struct btrfs_extent_data_ref *ref,
1161                                  u64 root_objectid, u64 owner, u64 offset)
1162 {
1163         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1164             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1165             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1166                 return 0;
1167         return 1;
1168 }
1169
1170 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1171                                            struct btrfs_root *root,
1172                                            struct btrfs_path *path,
1173                                            u64 bytenr, u64 parent,
1174                                            u64 root_objectid,
1175                                            u64 owner, u64 offset)
1176 {
1177         struct btrfs_key key;
1178         struct btrfs_extent_data_ref *ref;
1179         struct extent_buffer *leaf;
1180         u32 nritems;
1181         int ret;
1182         int recow;
1183         int err = -ENOENT;
1184
1185         key.objectid = bytenr;
1186         if (parent) {
1187                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1188                 key.offset = parent;
1189         } else {
1190                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1191                 key.offset = hash_extent_data_ref(root_objectid,
1192                                                   owner, offset);
1193         }
1194 again:
1195         recow = 0;
1196         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1197         if (ret < 0) {
1198                 err = ret;
1199                 goto fail;
1200         }
1201
1202         if (parent) {
1203                 if (!ret)
1204                         return 0;
1205 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1206                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1207                 btrfs_release_path(path);
1208                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1209                 if (ret < 0) {
1210                         err = ret;
1211                         goto fail;
1212                 }
1213                 if (!ret)
1214                         return 0;
1215 #endif
1216                 goto fail;
1217         }
1218
1219         leaf = path->nodes[0];
1220         nritems = btrfs_header_nritems(leaf);
1221         while (1) {
1222                 if (path->slots[0] >= nritems) {
1223                         ret = btrfs_next_leaf(root, path);
1224                         if (ret < 0)
1225                                 err = ret;
1226                         if (ret)
1227                                 goto fail;
1228
1229                         leaf = path->nodes[0];
1230                         nritems = btrfs_header_nritems(leaf);
1231                         recow = 1;
1232                 }
1233
1234                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1235                 if (key.objectid != bytenr ||
1236                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1237                         goto fail;
1238
1239                 ref = btrfs_item_ptr(leaf, path->slots[0],
1240                                      struct btrfs_extent_data_ref);
1241
1242                 if (match_extent_data_ref(leaf, ref, root_objectid,
1243                                           owner, offset)) {
1244                         if (recow) {
1245                                 btrfs_release_path(path);
1246                                 goto again;
1247                         }
1248                         err = 0;
1249                         break;
1250                 }
1251                 path->slots[0]++;
1252         }
1253 fail:
1254         return err;
1255 }
1256
1257 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1258                                            struct btrfs_root *root,
1259                                            struct btrfs_path *path,
1260                                            u64 bytenr, u64 parent,
1261                                            u64 root_objectid, u64 owner,
1262                                            u64 offset, int refs_to_add)
1263 {
1264         struct btrfs_key key;
1265         struct extent_buffer *leaf;
1266         u32 size;
1267         u32 num_refs;
1268         int ret;
1269
1270         key.objectid = bytenr;
1271         if (parent) {
1272                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1273                 key.offset = parent;
1274                 size = sizeof(struct btrfs_shared_data_ref);
1275         } else {
1276                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1277                 key.offset = hash_extent_data_ref(root_objectid,
1278                                                   owner, offset);
1279                 size = sizeof(struct btrfs_extent_data_ref);
1280         }
1281
1282         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1283         if (ret && ret != -EEXIST)
1284                 goto fail;
1285
1286         leaf = path->nodes[0];
1287         if (parent) {
1288                 struct btrfs_shared_data_ref *ref;
1289                 ref = btrfs_item_ptr(leaf, path->slots[0],
1290                                      struct btrfs_shared_data_ref);
1291                 if (ret == 0) {
1292                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1293                 } else {
1294                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1295                         num_refs += refs_to_add;
1296                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1297                 }
1298         } else {
1299                 struct btrfs_extent_data_ref *ref;
1300                 while (ret == -EEXIST) {
1301                         ref = btrfs_item_ptr(leaf, path->slots[0],
1302                                              struct btrfs_extent_data_ref);
1303                         if (match_extent_data_ref(leaf, ref, root_objectid,
1304                                                   owner, offset))
1305                                 break;
1306                         btrfs_release_path(path);
1307                         key.offset++;
1308                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1309                                                       size);
1310                         if (ret && ret != -EEXIST)
1311                                 goto fail;
1312
1313                         leaf = path->nodes[0];
1314                 }
1315                 ref = btrfs_item_ptr(leaf, path->slots[0],
1316                                      struct btrfs_extent_data_ref);
1317                 if (ret == 0) {
1318                         btrfs_set_extent_data_ref_root(leaf, ref,
1319                                                        root_objectid);
1320                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1321                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1322                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1323                 } else {
1324                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1325                         num_refs += refs_to_add;
1326                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1327                 }
1328         }
1329         btrfs_mark_buffer_dirty(leaf);
1330         ret = 0;
1331 fail:
1332         btrfs_release_path(path);
1333         return ret;
1334 }
1335
1336 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1337                                            struct btrfs_root *root,
1338                                            struct btrfs_path *path,
1339                                            int refs_to_drop, int *last_ref)
1340 {
1341         struct btrfs_key key;
1342         struct btrfs_extent_data_ref *ref1 = NULL;
1343         struct btrfs_shared_data_ref *ref2 = NULL;
1344         struct extent_buffer *leaf;
1345         u32 num_refs = 0;
1346         int ret = 0;
1347
1348         leaf = path->nodes[0];
1349         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1350
1351         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1352                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1353                                       struct btrfs_extent_data_ref);
1354                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1355         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1356                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1357                                       struct btrfs_shared_data_ref);
1358                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1359 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1360         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1361                 struct btrfs_extent_ref_v0 *ref0;
1362                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1363                                       struct btrfs_extent_ref_v0);
1364                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1365 #endif
1366         } else {
1367                 BUG();
1368         }
1369
1370         BUG_ON(num_refs < refs_to_drop);
1371         num_refs -= refs_to_drop;
1372
1373         if (num_refs == 0) {
1374                 ret = btrfs_del_item(trans, root, path);
1375                 *last_ref = 1;
1376         } else {
1377                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1378                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1379                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1380                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1381 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1382                 else {
1383                         struct btrfs_extent_ref_v0 *ref0;
1384                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1385                                         struct btrfs_extent_ref_v0);
1386                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1387                 }
1388 #endif
1389                 btrfs_mark_buffer_dirty(leaf);
1390         }
1391         return ret;
1392 }
1393
1394 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1395                                           struct btrfs_extent_inline_ref *iref)
1396 {
1397         struct btrfs_key key;
1398         struct extent_buffer *leaf;
1399         struct btrfs_extent_data_ref *ref1;
1400         struct btrfs_shared_data_ref *ref2;
1401         u32 num_refs = 0;
1402
1403         leaf = path->nodes[0];
1404         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1405         if (iref) {
1406                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1407                     BTRFS_EXTENT_DATA_REF_KEY) {
1408                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1409                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1410                 } else {
1411                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1412                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1413                 }
1414         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1415                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1416                                       struct btrfs_extent_data_ref);
1417                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1418         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1419                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1420                                       struct btrfs_shared_data_ref);
1421                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1422 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1423         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1424                 struct btrfs_extent_ref_v0 *ref0;
1425                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1426                                       struct btrfs_extent_ref_v0);
1427                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1428 #endif
1429         } else {
1430                 WARN_ON(1);
1431         }
1432         return num_refs;
1433 }
1434
1435 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1436                                           struct btrfs_root *root,
1437                                           struct btrfs_path *path,
1438                                           u64 bytenr, u64 parent,
1439                                           u64 root_objectid)
1440 {
1441         struct btrfs_key key;
1442         int ret;
1443
1444         key.objectid = bytenr;
1445         if (parent) {
1446                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1447                 key.offset = parent;
1448         } else {
1449                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1450                 key.offset = root_objectid;
1451         }
1452
1453         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1454         if (ret > 0)
1455                 ret = -ENOENT;
1456 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1457         if (ret == -ENOENT && parent) {
1458                 btrfs_release_path(path);
1459                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1460                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1461                 if (ret > 0)
1462                         ret = -ENOENT;
1463         }
1464 #endif
1465         return ret;
1466 }
1467
1468 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1469                                           struct btrfs_root *root,
1470                                           struct btrfs_path *path,
1471                                           u64 bytenr, u64 parent,
1472                                           u64 root_objectid)
1473 {
1474         struct btrfs_key key;
1475         int ret;
1476
1477         key.objectid = bytenr;
1478         if (parent) {
1479                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1480                 key.offset = parent;
1481         } else {
1482                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1483                 key.offset = root_objectid;
1484         }
1485
1486         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1487         btrfs_release_path(path);
1488         return ret;
1489 }
1490
1491 static inline int extent_ref_type(u64 parent, u64 owner)
1492 {
1493         int type;
1494         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1495                 if (parent > 0)
1496                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1497                 else
1498                         type = BTRFS_TREE_BLOCK_REF_KEY;
1499         } else {
1500                 if (parent > 0)
1501                         type = BTRFS_SHARED_DATA_REF_KEY;
1502                 else
1503                         type = BTRFS_EXTENT_DATA_REF_KEY;
1504         }
1505         return type;
1506 }
1507
1508 static int find_next_key(struct btrfs_path *path, int level,
1509                          struct btrfs_key *key)
1510
1511 {
1512         for (; level < BTRFS_MAX_LEVEL; level++) {
1513                 if (!path->nodes[level])
1514                         break;
1515                 if (path->slots[level] + 1 >=
1516                     btrfs_header_nritems(path->nodes[level]))
1517                         continue;
1518                 if (level == 0)
1519                         btrfs_item_key_to_cpu(path->nodes[level], key,
1520                                               path->slots[level] + 1);
1521                 else
1522                         btrfs_node_key_to_cpu(path->nodes[level], key,
1523                                               path->slots[level] + 1);
1524                 return 0;
1525         }
1526         return 1;
1527 }
1528
1529 /*
1530  * look for inline back ref. if back ref is found, *ref_ret is set
1531  * to the address of inline back ref, and 0 is returned.
1532  *
1533  * if back ref isn't found, *ref_ret is set to the address where it
1534  * should be inserted, and -ENOENT is returned.
1535  *
1536  * if insert is true and there are too many inline back refs, the path
1537  * points to the extent item, and -EAGAIN is returned.
1538  *
1539  * NOTE: inline back refs are ordered in the same way that back ref
1540  *       items in the tree are ordered.
1541  */
1542 static noinline_for_stack
1543 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1544                                  struct btrfs_root *root,
1545                                  struct btrfs_path *path,
1546                                  struct btrfs_extent_inline_ref **ref_ret,
1547                                  u64 bytenr, u64 num_bytes,
1548                                  u64 parent, u64 root_objectid,
1549                                  u64 owner, u64 offset, int insert)
1550 {
1551         struct btrfs_key key;
1552         struct extent_buffer *leaf;
1553         struct btrfs_extent_item *ei;
1554         struct btrfs_extent_inline_ref *iref;
1555         u64 flags;
1556         u64 item_size;
1557         unsigned long ptr;
1558         unsigned long end;
1559         int extra_size;
1560         int type;
1561         int want;
1562         int ret;
1563         int err = 0;
1564         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
1565                                                  SKINNY_METADATA);
1566
1567         key.objectid = bytenr;
1568         key.type = BTRFS_EXTENT_ITEM_KEY;
1569         key.offset = num_bytes;
1570
1571         want = extent_ref_type(parent, owner);
1572         if (insert) {
1573                 extra_size = btrfs_extent_inline_ref_size(want);
1574                 path->keep_locks = 1;
1575         } else
1576                 extra_size = -1;
1577
1578         /*
1579          * Owner is our parent level, so we can just add one to get the level
1580          * for the block we are interested in.
1581          */
1582         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1583                 key.type = BTRFS_METADATA_ITEM_KEY;
1584                 key.offset = owner;
1585         }
1586
1587 again:
1588         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1589         if (ret < 0) {
1590                 err = ret;
1591                 goto out;
1592         }
1593
1594         /*
1595          * We may be a newly converted file system which still has the old fat
1596          * extent entries for metadata, so try and see if we have one of those.
1597          */
1598         if (ret > 0 && skinny_metadata) {
1599                 skinny_metadata = false;
1600                 if (path->slots[0]) {
1601                         path->slots[0]--;
1602                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1603                                               path->slots[0]);
1604                         if (key.objectid == bytenr &&
1605                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1606                             key.offset == num_bytes)
1607                                 ret = 0;
1608                 }
1609                 if (ret) {
1610                         key.objectid = bytenr;
1611                         key.type = BTRFS_EXTENT_ITEM_KEY;
1612                         key.offset = num_bytes;
1613                         btrfs_release_path(path);
1614                         goto again;
1615                 }
1616         }
1617
1618         if (ret && !insert) {
1619                 err = -ENOENT;
1620                 goto out;
1621         } else if (WARN_ON(ret)) {
1622                 err = -EIO;
1623                 goto out;
1624         }
1625
1626         leaf = path->nodes[0];
1627         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1628 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1629         if (item_size < sizeof(*ei)) {
1630                 if (!insert) {
1631                         err = -ENOENT;
1632                         goto out;
1633                 }
1634                 ret = convert_extent_item_v0(trans, root, path, owner,
1635                                              extra_size);
1636                 if (ret < 0) {
1637                         err = ret;
1638                         goto out;
1639                 }
1640                 leaf = path->nodes[0];
1641                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1642         }
1643 #endif
1644         BUG_ON(item_size < sizeof(*ei));
1645
1646         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1647         flags = btrfs_extent_flags(leaf, ei);
1648
1649         ptr = (unsigned long)(ei + 1);
1650         end = (unsigned long)ei + item_size;
1651
1652         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1653                 ptr += sizeof(struct btrfs_tree_block_info);
1654                 BUG_ON(ptr > end);
1655         }
1656
1657         err = -ENOENT;
1658         while (1) {
1659                 if (ptr >= end) {
1660                         WARN_ON(ptr > end);
1661                         break;
1662                 }
1663                 iref = (struct btrfs_extent_inline_ref *)ptr;
1664                 type = btrfs_extent_inline_ref_type(leaf, iref);
1665                 if (want < type)
1666                         break;
1667                 if (want > type) {
1668                         ptr += btrfs_extent_inline_ref_size(type);
1669                         continue;
1670                 }
1671
1672                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1673                         struct btrfs_extent_data_ref *dref;
1674                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1675                         if (match_extent_data_ref(leaf, dref, root_objectid,
1676                                                   owner, offset)) {
1677                                 err = 0;
1678                                 break;
1679                         }
1680                         if (hash_extent_data_ref_item(leaf, dref) <
1681                             hash_extent_data_ref(root_objectid, owner, offset))
1682                                 break;
1683                 } else {
1684                         u64 ref_offset;
1685                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1686                         if (parent > 0) {
1687                                 if (parent == ref_offset) {
1688                                         err = 0;
1689                                         break;
1690                                 }
1691                                 if (ref_offset < parent)
1692                                         break;
1693                         } else {
1694                                 if (root_objectid == ref_offset) {
1695                                         err = 0;
1696                                         break;
1697                                 }
1698                                 if (ref_offset < root_objectid)
1699                                         break;
1700                         }
1701                 }
1702                 ptr += btrfs_extent_inline_ref_size(type);
1703         }
1704         if (err == -ENOENT && insert) {
1705                 if (item_size + extra_size >=
1706                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1707                         err = -EAGAIN;
1708                         goto out;
1709                 }
1710                 /*
1711                  * To add new inline back ref, we have to make sure
1712                  * there is no corresponding back ref item.
1713                  * For simplicity, we just do not add new inline back
1714                  * ref if there is any kind of item for this block
1715                  */
1716                 if (find_next_key(path, 0, &key) == 0 &&
1717                     key.objectid == bytenr &&
1718                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1719                         err = -EAGAIN;
1720                         goto out;
1721                 }
1722         }
1723         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1724 out:
1725         if (insert) {
1726                 path->keep_locks = 0;
1727                 btrfs_unlock_up_safe(path, 1);
1728         }
1729         return err;
1730 }
1731
1732 /*
1733  * helper to add new inline back ref
1734  */
1735 static noinline_for_stack
1736 void setup_inline_extent_backref(struct btrfs_root *root,
1737                                  struct btrfs_path *path,
1738                                  struct btrfs_extent_inline_ref *iref,
1739                                  u64 parent, u64 root_objectid,
1740                                  u64 owner, u64 offset, int refs_to_add,
1741                                  struct btrfs_delayed_extent_op *extent_op)
1742 {
1743         struct extent_buffer *leaf;
1744         struct btrfs_extent_item *ei;
1745         unsigned long ptr;
1746         unsigned long end;
1747         unsigned long item_offset;
1748         u64 refs;
1749         int size;
1750         int type;
1751
1752         leaf = path->nodes[0];
1753         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1754         item_offset = (unsigned long)iref - (unsigned long)ei;
1755
1756         type = extent_ref_type(parent, owner);
1757         size = btrfs_extent_inline_ref_size(type);
1758
1759         btrfs_extend_item(root, path, size);
1760
1761         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1762         refs = btrfs_extent_refs(leaf, ei);
1763         refs += refs_to_add;
1764         btrfs_set_extent_refs(leaf, ei, refs);
1765         if (extent_op)
1766                 __run_delayed_extent_op(extent_op, leaf, ei);
1767
1768         ptr = (unsigned long)ei + item_offset;
1769         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1770         if (ptr < end - size)
1771                 memmove_extent_buffer(leaf, ptr + size, ptr,
1772                                       end - size - ptr);
1773
1774         iref = (struct btrfs_extent_inline_ref *)ptr;
1775         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1776         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1777                 struct btrfs_extent_data_ref *dref;
1778                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1779                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1780                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1781                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1782                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1783         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1784                 struct btrfs_shared_data_ref *sref;
1785                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1786                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1787                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1788         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1789                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1790         } else {
1791                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1792         }
1793         btrfs_mark_buffer_dirty(leaf);
1794 }
1795
1796 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1797                                  struct btrfs_root *root,
1798                                  struct btrfs_path *path,
1799                                  struct btrfs_extent_inline_ref **ref_ret,
1800                                  u64 bytenr, u64 num_bytes, u64 parent,
1801                                  u64 root_objectid, u64 owner, u64 offset)
1802 {
1803         int ret;
1804
1805         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1806                                            bytenr, num_bytes, parent,
1807                                            root_objectid, owner, offset, 0);
1808         if (ret != -ENOENT)
1809                 return ret;
1810
1811         btrfs_release_path(path);
1812         *ref_ret = NULL;
1813
1814         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1815                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1816                                             root_objectid);
1817         } else {
1818                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1819                                              root_objectid, owner, offset);
1820         }
1821         return ret;
1822 }
1823
1824 /*
1825  * helper to update/remove inline back ref
1826  */
1827 static noinline_for_stack
1828 void update_inline_extent_backref(struct btrfs_root *root,
1829                                   struct btrfs_path *path,
1830                                   struct btrfs_extent_inline_ref *iref,
1831                                   int refs_to_mod,
1832                                   struct btrfs_delayed_extent_op *extent_op,
1833                                   int *last_ref)
1834 {
1835         struct extent_buffer *leaf;
1836         struct btrfs_extent_item *ei;
1837         struct btrfs_extent_data_ref *dref = NULL;
1838         struct btrfs_shared_data_ref *sref = NULL;
1839         unsigned long ptr;
1840         unsigned long end;
1841         u32 item_size;
1842         int size;
1843         int type;
1844         u64 refs;
1845
1846         leaf = path->nodes[0];
1847         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1848         refs = btrfs_extent_refs(leaf, ei);
1849         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1850         refs += refs_to_mod;
1851         btrfs_set_extent_refs(leaf, ei, refs);
1852         if (extent_op)
1853                 __run_delayed_extent_op(extent_op, leaf, ei);
1854
1855         type = btrfs_extent_inline_ref_type(leaf, iref);
1856
1857         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1858                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1859                 refs = btrfs_extent_data_ref_count(leaf, dref);
1860         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1861                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1862                 refs = btrfs_shared_data_ref_count(leaf, sref);
1863         } else {
1864                 refs = 1;
1865                 BUG_ON(refs_to_mod != -1);
1866         }
1867
1868         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1869         refs += refs_to_mod;
1870
1871         if (refs > 0) {
1872                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1873                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1874                 else
1875                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1876         } else {
1877                 *last_ref = 1;
1878                 size =  btrfs_extent_inline_ref_size(type);
1879                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1880                 ptr = (unsigned long)iref;
1881                 end = (unsigned long)ei + item_size;
1882                 if (ptr + size < end)
1883                         memmove_extent_buffer(leaf, ptr, ptr + size,
1884                                               end - ptr - size);
1885                 item_size -= size;
1886                 btrfs_truncate_item(root, path, item_size, 1);
1887         }
1888         btrfs_mark_buffer_dirty(leaf);
1889 }
1890
1891 static noinline_for_stack
1892 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1893                                  struct btrfs_root *root,
1894                                  struct btrfs_path *path,
1895                                  u64 bytenr, u64 num_bytes, u64 parent,
1896                                  u64 root_objectid, u64 owner,
1897                                  u64 offset, int refs_to_add,
1898                                  struct btrfs_delayed_extent_op *extent_op)
1899 {
1900         struct btrfs_extent_inline_ref *iref;
1901         int ret;
1902
1903         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1904                                            bytenr, num_bytes, parent,
1905                                            root_objectid, owner, offset, 1);
1906         if (ret == 0) {
1907                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1908                 update_inline_extent_backref(root, path, iref,
1909                                              refs_to_add, extent_op, NULL);
1910         } else if (ret == -ENOENT) {
1911                 setup_inline_extent_backref(root, path, iref, parent,
1912                                             root_objectid, owner, offset,
1913                                             refs_to_add, extent_op);
1914                 ret = 0;
1915         }
1916         return ret;
1917 }
1918
1919 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1920                                  struct btrfs_root *root,
1921                                  struct btrfs_path *path,
1922                                  u64 bytenr, u64 parent, u64 root_objectid,
1923                                  u64 owner, u64 offset, int refs_to_add)
1924 {
1925         int ret;
1926         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1927                 BUG_ON(refs_to_add != 1);
1928                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1929                                             parent, root_objectid);
1930         } else {
1931                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1932                                              parent, root_objectid,
1933                                              owner, offset, refs_to_add);
1934         }
1935         return ret;
1936 }
1937
1938 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1939                                  struct btrfs_root *root,
1940                                  struct btrfs_path *path,
1941                                  struct btrfs_extent_inline_ref *iref,
1942                                  int refs_to_drop, int is_data, int *last_ref)
1943 {
1944         int ret = 0;
1945
1946         BUG_ON(!is_data && refs_to_drop != 1);
1947         if (iref) {
1948                 update_inline_extent_backref(root, path, iref,
1949                                              -refs_to_drop, NULL, last_ref);
1950         } else if (is_data) {
1951                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
1952                                              last_ref);
1953         } else {
1954                 *last_ref = 1;
1955                 ret = btrfs_del_item(trans, root, path);
1956         }
1957         return ret;
1958 }
1959
1960 #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
1961 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1962                                u64 *discarded_bytes)
1963 {
1964         int j, ret = 0;
1965         u64 bytes_left, end;
1966         u64 aligned_start = ALIGN(start, 1 << 9);
1967
1968         if (WARN_ON(start != aligned_start)) {
1969                 len -= aligned_start - start;
1970                 len = round_down(len, 1 << 9);
1971                 start = aligned_start;
1972         }
1973
1974         *discarded_bytes = 0;
1975
1976         if (!len)
1977                 return 0;
1978
1979         end = start + len;
1980         bytes_left = len;
1981
1982         /* Skip any superblocks on this device. */
1983         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1984                 u64 sb_start = btrfs_sb_offset(j);
1985                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1986                 u64 size = sb_start - start;
1987
1988                 if (!in_range(sb_start, start, bytes_left) &&
1989                     !in_range(sb_end, start, bytes_left) &&
1990                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1991                         continue;
1992
1993                 /*
1994                  * Superblock spans beginning of range.  Adjust start and
1995                  * try again.
1996                  */
1997                 if (sb_start <= start) {
1998                         start += sb_end - start;
1999                         if (start > end) {
2000                                 bytes_left = 0;
2001                                 break;
2002                         }
2003                         bytes_left = end - start;
2004                         continue;
2005                 }
2006
2007                 if (size) {
2008                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
2009                                                    GFP_NOFS, 0);
2010                         if (!ret)
2011                                 *discarded_bytes += size;
2012                         else if (ret != -EOPNOTSUPP)
2013                                 return ret;
2014                 }
2015
2016                 start = sb_end;
2017                 if (start > end) {
2018                         bytes_left = 0;
2019                         break;
2020                 }
2021                 bytes_left = end - start;
2022         }
2023
2024         if (bytes_left) {
2025                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
2026                                            GFP_NOFS, 0);
2027                 if (!ret)
2028                         *discarded_bytes += bytes_left;
2029         }
2030         return ret;
2031 }
2032
2033 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
2034                          u64 num_bytes, u64 *actual_bytes)
2035 {
2036         int ret;
2037         u64 discarded_bytes = 0;
2038         struct btrfs_bio *bbio = NULL;
2039
2040
2041         /*
2042          * Avoid races with device replace and make sure our bbio has devices
2043          * associated to its stripes that don't go away while we are discarding.
2044          */
2045         btrfs_bio_counter_inc_blocked(root->fs_info);
2046         /* Tell the block device(s) that the sectors can be discarded */
2047         ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
2048                               bytenr, &num_bytes, &bbio, 0);
2049         /* Error condition is -ENOMEM */
2050         if (!ret) {
2051                 struct btrfs_bio_stripe *stripe = bbio->stripes;
2052                 int i;
2053
2054
2055                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2056                         u64 bytes;
2057                         if (!stripe->dev->can_discard)
2058                                 continue;
2059
2060                         ret = btrfs_issue_discard(stripe->dev->bdev,
2061                                                   stripe->physical,
2062                                                   stripe->length,
2063                                                   &bytes);
2064                         if (!ret)
2065                                 discarded_bytes += bytes;
2066                         else if (ret != -EOPNOTSUPP)
2067                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2068
2069                         /*
2070                          * Just in case we get back EOPNOTSUPP for some reason,
2071                          * just ignore the return value so we don't screw up
2072                          * people calling discard_extent.
2073                          */
2074                         ret = 0;
2075                 }
2076                 btrfs_put_bbio(bbio);
2077         }
2078         btrfs_bio_counter_dec(root->fs_info);
2079
2080         if (actual_bytes)
2081                 *actual_bytes = discarded_bytes;
2082
2083
2084         if (ret == -EOPNOTSUPP)
2085                 ret = 0;
2086         return ret;
2087 }
2088
2089 /* Can return -ENOMEM */
2090 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2091                          struct btrfs_root *root,
2092                          u64 bytenr, u64 num_bytes, u64 parent,
2093                          u64 root_objectid, u64 owner, u64 offset)
2094 {
2095         int ret;
2096         struct btrfs_fs_info *fs_info = root->fs_info;
2097
2098         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2099                root_objectid == BTRFS_TREE_LOG_OBJECTID);
2100
2101         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2102                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2103                                         num_bytes,
2104                                         parent, root_objectid, (int)owner,
2105                                         BTRFS_ADD_DELAYED_REF, NULL);
2106         } else {
2107                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2108                                         num_bytes, parent, root_objectid,
2109                                         owner, offset, 0,
2110                                         BTRFS_ADD_DELAYED_REF, NULL);
2111         }
2112         return ret;
2113 }
2114
2115 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2116                                   struct btrfs_root *root,
2117                                   struct btrfs_delayed_ref_node *node,
2118                                   u64 parent, u64 root_objectid,
2119                                   u64 owner, u64 offset, int refs_to_add,
2120                                   struct btrfs_delayed_extent_op *extent_op)
2121 {
2122         struct btrfs_fs_info *fs_info = root->fs_info;
2123         struct btrfs_path *path;
2124         struct extent_buffer *leaf;
2125         struct btrfs_extent_item *item;
2126         struct btrfs_key key;
2127         u64 bytenr = node->bytenr;
2128         u64 num_bytes = node->num_bytes;
2129         u64 refs;
2130         int ret;
2131
2132         path = btrfs_alloc_path();
2133         if (!path)
2134                 return -ENOMEM;
2135
2136         path->reada = READA_FORWARD;
2137         path->leave_spinning = 1;
2138         /* this will setup the path even if it fails to insert the back ref */
2139         ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
2140                                            bytenr, num_bytes, parent,
2141                                            root_objectid, owner, offset,
2142                                            refs_to_add, extent_op);
2143         if ((ret < 0 && ret != -EAGAIN) || !ret)
2144                 goto out;
2145
2146         /*
2147          * Ok we had -EAGAIN which means we didn't have space to insert and
2148          * inline extent ref, so just update the reference count and add a
2149          * normal backref.
2150          */
2151         leaf = path->nodes[0];
2152         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2153         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2154         refs = btrfs_extent_refs(leaf, item);
2155         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2156         if (extent_op)
2157                 __run_delayed_extent_op(extent_op, leaf, item);
2158
2159         btrfs_mark_buffer_dirty(leaf);
2160         btrfs_release_path(path);
2161
2162         path->reada = READA_FORWARD;
2163         path->leave_spinning = 1;
2164         /* now insert the actual backref */
2165         ret = insert_extent_backref(trans, root->fs_info->extent_root,
2166                                     path, bytenr, parent, root_objectid,
2167                                     owner, offset, refs_to_add);
2168         if (ret)
2169                 btrfs_abort_transaction(trans, ret);
2170 out:
2171         btrfs_free_path(path);
2172         return ret;
2173 }
2174
2175 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2176                                 struct btrfs_root *root,
2177                                 struct btrfs_delayed_ref_node *node,
2178                                 struct btrfs_delayed_extent_op *extent_op,
2179                                 int insert_reserved)
2180 {
2181         int ret = 0;
2182         struct btrfs_delayed_data_ref *ref;
2183         struct btrfs_key ins;
2184         u64 parent = 0;
2185         u64 ref_root = 0;
2186         u64 flags = 0;
2187
2188         ins.objectid = node->bytenr;
2189         ins.offset = node->num_bytes;
2190         ins.type = BTRFS_EXTENT_ITEM_KEY;
2191
2192         ref = btrfs_delayed_node_to_data_ref(node);
2193         trace_run_delayed_data_ref(root->fs_info, node, ref, node->action);
2194
2195         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2196                 parent = ref->parent;
2197         ref_root = ref->root;
2198
2199         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2200                 if (extent_op)
2201                         flags |= extent_op->flags_to_set;
2202                 ret = alloc_reserved_file_extent(trans, root,
2203                                                  parent, ref_root, flags,
2204                                                  ref->objectid, ref->offset,
2205                                                  &ins, node->ref_mod);
2206         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2207                 ret = __btrfs_inc_extent_ref(trans, root, node, parent,
2208                                              ref_root, ref->objectid,
2209                                              ref->offset, node->ref_mod,
2210                                              extent_op);
2211         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2212                 ret = __btrfs_free_extent(trans, root, node, parent,
2213                                           ref_root, ref->objectid,
2214                                           ref->offset, node->ref_mod,
2215                                           extent_op);
2216         } else {
2217                 BUG();
2218         }
2219         return ret;
2220 }
2221
2222 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2223                                     struct extent_buffer *leaf,
2224                                     struct btrfs_extent_item *ei)
2225 {
2226         u64 flags = btrfs_extent_flags(leaf, ei);
2227         if (extent_op->update_flags) {
2228                 flags |= extent_op->flags_to_set;
2229                 btrfs_set_extent_flags(leaf, ei, flags);
2230         }
2231
2232         if (extent_op->update_key) {
2233                 struct btrfs_tree_block_info *bi;
2234                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2235                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2236                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2237         }
2238 }
2239
2240 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2241                                  struct btrfs_root *root,
2242                                  struct btrfs_delayed_ref_node *node,
2243                                  struct btrfs_delayed_extent_op *extent_op)
2244 {
2245         struct btrfs_key key;
2246         struct btrfs_path *path;
2247         struct btrfs_extent_item *ei;
2248         struct extent_buffer *leaf;
2249         u32 item_size;
2250         int ret;
2251         int err = 0;
2252         int metadata = !extent_op->is_data;
2253
2254         if (trans->aborted)
2255                 return 0;
2256
2257         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2258                 metadata = 0;
2259
2260         path = btrfs_alloc_path();
2261         if (!path)
2262                 return -ENOMEM;
2263
2264         key.objectid = node->bytenr;
2265
2266         if (metadata) {
2267                 key.type = BTRFS_METADATA_ITEM_KEY;
2268                 key.offset = extent_op->level;
2269         } else {
2270                 key.type = BTRFS_EXTENT_ITEM_KEY;
2271                 key.offset = node->num_bytes;
2272         }
2273
2274 again:
2275         path->reada = READA_FORWARD;
2276         path->leave_spinning = 1;
2277         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2278                                 path, 0, 1);
2279         if (ret < 0) {
2280                 err = ret;
2281                 goto out;
2282         }
2283         if (ret > 0) {
2284                 if (metadata) {
2285                         if (path->slots[0] > 0) {
2286                                 path->slots[0]--;
2287                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2288                                                       path->slots[0]);
2289                                 if (key.objectid == node->bytenr &&
2290                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2291                                     key.offset == node->num_bytes)
2292                                         ret = 0;
2293                         }
2294                         if (ret > 0) {
2295                                 btrfs_release_path(path);
2296                                 metadata = 0;
2297
2298                                 key.objectid = node->bytenr;
2299                                 key.offset = node->num_bytes;
2300                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2301                                 goto again;
2302                         }
2303                 } else {
2304                         err = -EIO;
2305                         goto out;
2306                 }
2307         }
2308
2309         leaf = path->nodes[0];
2310         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2311 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2312         if (item_size < sizeof(*ei)) {
2313                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2314                                              path, (u64)-1, 0);
2315                 if (ret < 0) {
2316                         err = ret;
2317                         goto out;
2318                 }
2319                 leaf = path->nodes[0];
2320                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2321         }
2322 #endif
2323         BUG_ON(item_size < sizeof(*ei));
2324         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2325         __run_delayed_extent_op(extent_op, leaf, ei);
2326
2327         btrfs_mark_buffer_dirty(leaf);
2328 out:
2329         btrfs_free_path(path);
2330         return err;
2331 }
2332
2333 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2334                                 struct btrfs_root *root,
2335                                 struct btrfs_delayed_ref_node *node,
2336                                 struct btrfs_delayed_extent_op *extent_op,
2337                                 int insert_reserved)
2338 {
2339         int ret = 0;
2340         struct btrfs_delayed_tree_ref *ref;
2341         struct btrfs_key ins;
2342         u64 parent = 0;
2343         u64 ref_root = 0;
2344         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2345                                                  SKINNY_METADATA);
2346
2347         ref = btrfs_delayed_node_to_tree_ref(node);
2348         trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action);
2349
2350         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2351                 parent = ref->parent;
2352         ref_root = ref->root;
2353
2354         ins.objectid = node->bytenr;
2355         if (skinny_metadata) {
2356                 ins.offset = ref->level;
2357                 ins.type = BTRFS_METADATA_ITEM_KEY;
2358         } else {
2359                 ins.offset = node->num_bytes;
2360                 ins.type = BTRFS_EXTENT_ITEM_KEY;
2361         }
2362
2363         BUG_ON(node->ref_mod != 1);
2364         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2365                 BUG_ON(!extent_op || !extent_op->update_flags);
2366                 ret = alloc_reserved_tree_block(trans, root,
2367                                                 parent, ref_root,
2368                                                 extent_op->flags_to_set,
2369                                                 &extent_op->key,
2370                                                 ref->level, &ins);
2371         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2372                 ret = __btrfs_inc_extent_ref(trans, root, node,
2373                                              parent, ref_root,
2374                                              ref->level, 0, 1,
2375                                              extent_op);
2376         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2377                 ret = __btrfs_free_extent(trans, root, node,
2378                                           parent, ref_root,
2379                                           ref->level, 0, 1, extent_op);
2380         } else {
2381                 BUG();
2382         }
2383         return ret;
2384 }
2385
2386 /* helper function to actually process a single delayed ref entry */
2387 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2388                                struct btrfs_root *root,
2389                                struct btrfs_delayed_ref_node *node,
2390                                struct btrfs_delayed_extent_op *extent_op,
2391                                int insert_reserved)
2392 {
2393         int ret = 0;
2394
2395         if (trans->aborted) {
2396                 if (insert_reserved)
2397                         btrfs_pin_extent(root, node->bytenr,
2398                                          node->num_bytes, 1);
2399                 return 0;
2400         }
2401
2402         if (btrfs_delayed_ref_is_head(node)) {
2403                 struct btrfs_delayed_ref_head *head;
2404                 /*
2405                  * we've hit the end of the chain and we were supposed
2406                  * to insert this extent into the tree.  But, it got
2407                  * deleted before we ever needed to insert it, so all
2408                  * we have to do is clean up the accounting
2409                  */
2410                 BUG_ON(extent_op);
2411                 head = btrfs_delayed_node_to_head(node);
2412                 trace_run_delayed_ref_head(root->fs_info, node, head,
2413                                            node->action);
2414
2415                 if (insert_reserved) {
2416                         btrfs_pin_extent(root, node->bytenr,
2417                                          node->num_bytes, 1);
2418                         if (head->is_data) {
2419                                 ret = btrfs_del_csums(trans, root,
2420                                                       node->bytenr,
2421                                                       node->num_bytes);
2422                         }
2423                 }
2424
2425                 /* Also free its reserved qgroup space */
2426                 btrfs_qgroup_free_delayed_ref(root->fs_info,
2427                                               head->qgroup_ref_root,
2428                                               head->qgroup_reserved);
2429                 return ret;
2430         }
2431
2432         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2433             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2434                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2435                                            insert_reserved);
2436         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2437                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2438                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2439                                            insert_reserved);
2440         else
2441                 BUG();
2442         return ret;
2443 }
2444
2445 static inline struct btrfs_delayed_ref_node *
2446 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2447 {
2448         struct btrfs_delayed_ref_node *ref;
2449
2450         if (list_empty(&head->ref_list))
2451                 return NULL;
2452
2453         /*
2454          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2455          * This is to prevent a ref count from going down to zero, which deletes
2456          * the extent item from the extent tree, when there still are references
2457          * to add, which would fail because they would not find the extent item.
2458          */
2459         list_for_each_entry(ref, &head->ref_list, list) {
2460                 if (ref->action == BTRFS_ADD_DELAYED_REF)
2461                         return ref;
2462         }
2463
2464         return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
2465                           list);
2466 }
2467
2468 /*
2469  * Returns 0 on success or if called with an already aborted transaction.
2470  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2471  */
2472 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2473                                              struct btrfs_root *root,
2474                                              unsigned long nr)
2475 {
2476         struct btrfs_delayed_ref_root *delayed_refs;
2477         struct btrfs_delayed_ref_node *ref;
2478         struct btrfs_delayed_ref_head *locked_ref = NULL;
2479         struct btrfs_delayed_extent_op *extent_op;
2480         struct btrfs_fs_info *fs_info = root->fs_info;
2481         ktime_t start = ktime_get();
2482         int ret;
2483         unsigned long count = 0;
2484         unsigned long actual_count = 0;
2485         int must_insert_reserved = 0;
2486
2487         delayed_refs = &trans->transaction->delayed_refs;
2488         while (1) {
2489                 if (!locked_ref) {
2490                         if (count >= nr)
2491                                 break;
2492
2493                         spin_lock(&delayed_refs->lock);
2494                         locked_ref = btrfs_select_ref_head(trans);
2495                         if (!locked_ref) {
2496                                 spin_unlock(&delayed_refs->lock);
2497                                 break;
2498                         }
2499
2500                         /* grab the lock that says we are going to process
2501                          * all the refs for this head */
2502                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2503                         spin_unlock(&delayed_refs->lock);
2504                         /*
2505                          * we may have dropped the spin lock to get the head
2506                          * mutex lock, and that might have given someone else
2507                          * time to free the head.  If that's true, it has been
2508                          * removed from our list and we can move on.
2509                          */
2510                         if (ret == -EAGAIN) {
2511                                 locked_ref = NULL;
2512                                 count++;
2513                                 continue;
2514                         }
2515                 }
2516
2517                 /*
2518                  * We need to try and merge add/drops of the same ref since we
2519                  * can run into issues with relocate dropping the implicit ref
2520                  * and then it being added back again before the drop can
2521                  * finish.  If we merged anything we need to re-loop so we can
2522                  * get a good ref.
2523                  * Or we can get node references of the same type that weren't
2524                  * merged when created due to bumps in the tree mod seq, and
2525                  * we need to merge them to prevent adding an inline extent
2526                  * backref before dropping it (triggering a BUG_ON at
2527                  * insert_inline_extent_backref()).
2528                  */
2529                 spin_lock(&locked_ref->lock);
2530                 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2531                                          locked_ref);
2532
2533                 /*
2534                  * locked_ref is the head node, so we have to go one
2535                  * node back for any delayed ref updates
2536                  */
2537                 ref = select_delayed_ref(locked_ref);
2538
2539                 if (ref && ref->seq &&
2540                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2541                         spin_unlock(&locked_ref->lock);
2542                         btrfs_delayed_ref_unlock(locked_ref);
2543                         spin_lock(&delayed_refs->lock);
2544                         locked_ref->processing = 0;
2545                         delayed_refs->num_heads_ready++;
2546                         spin_unlock(&delayed_refs->lock);
2547                         locked_ref = NULL;
2548                         cond_resched();
2549                         count++;
2550                         continue;
2551                 }
2552
2553                 /*
2554                  * record the must insert reserved flag before we
2555                  * drop the spin lock.
2556                  */
2557                 must_insert_reserved = locked_ref->must_insert_reserved;
2558                 locked_ref->must_insert_reserved = 0;
2559
2560                 extent_op = locked_ref->extent_op;
2561                 locked_ref->extent_op = NULL;
2562
2563                 if (!ref) {
2564
2565
2566                         /* All delayed refs have been processed, Go ahead
2567                          * and send the head node to run_one_delayed_ref,
2568                          * so that any accounting fixes can happen
2569                          */
2570                         ref = &locked_ref->node;
2571
2572                         if (extent_op && must_insert_reserved) {
2573                                 btrfs_free_delayed_extent_op(extent_op);
2574                                 extent_op = NULL;
2575                         }
2576
2577                         if (extent_op) {
2578                                 spin_unlock(&locked_ref->lock);
2579                                 ret = run_delayed_extent_op(trans, root,
2580                                                             ref, extent_op);
2581                                 btrfs_free_delayed_extent_op(extent_op);
2582
2583                                 if (ret) {
2584                                         /*
2585                                          * Need to reset must_insert_reserved if
2586                                          * there was an error so the abort stuff
2587                                          * can cleanup the reserved space
2588                                          * properly.
2589                                          */
2590                                         if (must_insert_reserved)
2591                                                 locked_ref->must_insert_reserved = 1;
2592                                         locked_ref->processing = 0;
2593                                         btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2594                                         btrfs_delayed_ref_unlock(locked_ref);
2595                                         return ret;
2596                                 }
2597                                 continue;
2598                         }
2599
2600                         /*
2601                          * Need to drop our head ref lock and re-acquire the
2602                          * delayed ref lock and then re-check to make sure
2603                          * nobody got added.
2604                          */
2605                         spin_unlock(&locked_ref->lock);
2606                         spin_lock(&delayed_refs->lock);
2607                         spin_lock(&locked_ref->lock);
2608                         if (!list_empty(&locked_ref->ref_list) ||
2609                             locked_ref->extent_op) {
2610                                 spin_unlock(&locked_ref->lock);
2611                                 spin_unlock(&delayed_refs->lock);
2612                                 continue;
2613                         }
2614                         ref->in_tree = 0;
2615                         delayed_refs->num_heads--;
2616                         rb_erase(&locked_ref->href_node,
2617                                  &delayed_refs->href_root);
2618                         spin_unlock(&delayed_refs->lock);
2619                 } else {
2620                         actual_count++;
2621                         ref->in_tree = 0;
2622                         list_del(&ref->list);
2623                 }
2624                 atomic_dec(&delayed_refs->num_entries);
2625
2626                 if (!btrfs_delayed_ref_is_head(ref)) {
2627                         /*
2628                          * when we play the delayed ref, also correct the
2629                          * ref_mod on head
2630                          */
2631                         switch (ref->action) {
2632                         case BTRFS_ADD_DELAYED_REF:
2633                         case BTRFS_ADD_DELAYED_EXTENT:
2634                                 locked_ref->node.ref_mod -= ref->ref_mod;
2635                                 break;
2636                         case BTRFS_DROP_DELAYED_REF:
2637                                 locked_ref->node.ref_mod += ref->ref_mod;
2638                                 break;
2639                         default:
2640                                 WARN_ON(1);
2641                         }
2642                 }
2643                 spin_unlock(&locked_ref->lock);
2644
2645                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2646                                           must_insert_reserved);
2647
2648                 btrfs_free_delayed_extent_op(extent_op);
2649                 if (ret) {
2650                         locked_ref->processing = 0;
2651                         btrfs_delayed_ref_unlock(locked_ref);
2652                         btrfs_put_delayed_ref(ref);
2653                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2654                         return ret;
2655                 }
2656
2657                 /*
2658                  * If this node is a head, that means all the refs in this head
2659                  * have been dealt with, and we will pick the next head to deal
2660                  * with, so we must unlock the head and drop it from the cluster
2661                  * list before we release it.
2662                  */
2663                 if (btrfs_delayed_ref_is_head(ref)) {
2664                         if (locked_ref->is_data &&
2665                             locked_ref->total_ref_mod < 0) {
2666                                 spin_lock(&delayed_refs->lock);
2667                                 delayed_refs->pending_csums -= ref->num_bytes;
2668                                 spin_unlock(&delayed_refs->lock);
2669                         }
2670                         btrfs_delayed_ref_unlock(locked_ref);
2671                         locked_ref = NULL;
2672                 }
2673                 btrfs_put_delayed_ref(ref);
2674                 count++;
2675                 cond_resched();
2676         }
2677
2678         /*
2679          * We don't want to include ref heads since we can have empty ref heads
2680          * and those will drastically skew our runtime down since we just do
2681          * accounting, no actual extent tree updates.
2682          */
2683         if (actual_count > 0) {
2684                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2685                 u64 avg;
2686
2687                 /*
2688                  * We weigh the current average higher than our current runtime
2689                  * to avoid large swings in the average.
2690                  */
2691                 spin_lock(&delayed_refs->lock);
2692                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2693                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2694                 spin_unlock(&delayed_refs->lock);
2695         }
2696         return 0;
2697 }
2698
#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning.
 *
 * Walks down from the root of @root, alternating between the left and the
 * right child, and returns the bytenr of the last entry visited.  Returns 0
 * when the tree is empty.
 */
static u64 find_middle(struct rb_root *root)
{
        struct rb_node *n = root->rb_node;
        struct btrfs_delayed_ref_node *entry;
        int alt = 1;
        /* Defined result for an empty tree instead of an uninitialized read. */
        u64 middle = 0;

        while (n) {
                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
                WARN_ON(!entry->in_tree);

                middle = entry->bytenr;

                /* Zig-zag: left child first, then right, then left, ... */
                if (alt)
                        n = n->rb_left;
                else
                        n = n->rb_right;

                alt = 1 - alt;
        }
        return middle;
}
#endif
2741
2742 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2743 {
2744         u64 num_bytes;
2745
2746         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2747                              sizeof(struct btrfs_extent_inline_ref));
2748         if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2749                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2750
2751         /*
2752          * We don't ever fill up leaves all the way so multiply by 2 just to be
2753          * closer to what we're really going to want to use.
2754          */
2755         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2756 }
2757
2758 /*
2759  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
2760  * would require to store the csums for that many bytes.
2761  */
2762 u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
2763 {
2764         u64 csum_size;
2765         u64 num_csums_per_leaf;
2766         u64 num_csums;
2767
2768         csum_size = BTRFS_MAX_ITEM_SIZE(root);
2769         num_csums_per_leaf = div64_u64(csum_size,
2770                         (u64)btrfs_super_csum_size(root->fs_info->super_copy));
2771         num_csums = div64_u64(csum_bytes, root->sectorsize);
2772         num_csums += num_csums_per_leaf - 1;
2773         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2774         return num_csums;
2775 }
2776
2777 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2778                                        struct btrfs_root *root)
2779 {
2780         struct btrfs_block_rsv *global_rsv;
2781         u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2782         u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2783         u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2784         u64 num_bytes, num_dirty_bgs_bytes;
2785         int ret = 0;
2786
2787         num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2788         num_heads = heads_to_leaves(root, num_heads);
2789         if (num_heads > 1)
2790                 num_bytes += (num_heads - 1) * root->nodesize;
2791         num_bytes <<= 1;
2792         num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
2793         num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
2794                                                              num_dirty_bgs);
2795         global_rsv = &root->fs_info->global_block_rsv;
2796
2797         /*
2798          * If we can't allocate any more chunks lets make sure we have _lots_ of
2799          * wiggle room since running delayed refs can create more delayed refs.
2800          */
2801         if (global_rsv->space_info->full) {
2802                 num_dirty_bgs_bytes <<= 1;
2803                 num_bytes <<= 1;
2804         }
2805
2806         spin_lock(&global_rsv->lock);
2807         if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2808                 ret = 1;
2809         spin_unlock(&global_rsv->lock);
2810         return ret;
2811 }
2812
2813 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2814                                        struct btrfs_root *root)
2815 {
2816         struct btrfs_fs_info *fs_info = root->fs_info;
2817         u64 num_entries =
2818                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2819         u64 avg_runtime;
2820         u64 val;
2821
2822         smp_mb();
2823         avg_runtime = fs_info->avg_delayed_ref_runtime;
2824         val = num_entries * avg_runtime;
2825         if (num_entries * avg_runtime >= NSEC_PER_SEC)
2826                 return 1;
2827         if (val >= NSEC_PER_SEC / 2)
2828                 return 2;
2829
2830         return btrfs_check_space_for_delayed_refs(trans, root);
2831 }
2832
/*
 * Work item used to run delayed refs from a worker thread; filled in by
 * btrfs_async_run_delayed_refs() and consumed by delayed_ref_async_start().
 */
struct async_delayed_refs {
        /* root used to join the transaction and run the refs */
        struct btrfs_root *root;
        /* transid of the submitter; the worker skips the run if it joined a newer one */
        u64 transid;
        /* count passed on to btrfs_run_delayed_refs() */
        int count;
        /* first error hit by the worker, 0 if none */
        int error;
        /*
         * non-zero: the submitter waits on @wait and frees this struct;
         * zero: the worker frees it when done
         */
        int sync;
        /* completed by the worker when @sync is set */
        struct completion wait;
        /* the queued work item */
        struct btrfs_work work;
};
2842
2843 static void delayed_ref_async_start(struct btrfs_work *work)
2844 {
2845         struct async_delayed_refs *async;
2846         struct btrfs_trans_handle *trans;
2847         int ret;
2848
2849         async = container_of(work, struct async_delayed_refs, work);
2850
2851         /* if the commit is already started, we don't need to wait here */
2852         if (btrfs_transaction_blocked(async->root->fs_info))
2853                 goto done;
2854
2855         trans = btrfs_join_transaction(async->root);
2856         if (IS_ERR(trans)) {
2857                 async->error = PTR_ERR(trans);
2858                 goto done;
2859         }
2860
2861         /*
2862          * trans->sync means that when we call end_transaction, we won't
2863          * wait on delayed refs
2864          */
2865         trans->sync = true;
2866
2867         /* Don't bother flushing if we got into a different transaction */
2868         if (trans->transid > async->transid)
2869                 goto end;
2870
2871         ret = btrfs_run_delayed_refs(trans, async->root, async->count);
2872         if (ret)
2873                 async->error = ret;
2874 end:
2875         ret = btrfs_end_transaction(trans, async->root);
2876         if (ret && !async->error)
2877                 async->error = ret;
2878 done:
2879         if (async->sync)
2880                 complete(&async->wait);
2881         else
2882                 kfree(async);
2883 }
2884
2885 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2886                                  unsigned long count, u64 transid, int wait)
2887 {
2888         struct async_delayed_refs *async;
2889         int ret;
2890
2891         async = kmalloc(sizeof(*async), GFP_NOFS);
2892         if (!async)
2893                 return -ENOMEM;
2894
2895         async->root = root->fs_info->tree_root;
2896         async->count = count;
2897         async->error = 0;
2898         async->transid = transid;
2899         if (wait)
2900                 async->sync = 1;
2901         else
2902                 async->sync = 0;
2903         init_completion(&async->wait);
2904
2905         btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2906                         delayed_ref_async_start, NULL, NULL);
2907
2908         btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2909
2910         if (wait) {
2911                 wait_for_completion(&async->wait);
2912                 ret = async->error;
2913                 kfree(async);
2914                 return ret;
2915         }
2916         return 0;
2917 }
2918
2919 /*
2920  * this starts processing the delayed reference count updates and
2921  * extent insertions we have queued up so far.  count can be
2922  * 0, which means to process everything in the tree at the start
2923  * of the run (but not newly added entries), or it can be some target
2924  * number you'd like to process.
2925  *
2926  * Returns 0 on success or if called with an aborted transaction
2927  * Returns <0 on error and aborts the transaction
2928  */
2929 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2930                            struct btrfs_root *root, unsigned long count)
2931 {
2932         struct rb_node *node;
2933         struct btrfs_delayed_ref_root *delayed_refs;
2934         struct btrfs_delayed_ref_head *head;
2935         int ret;
2936         int run_all = count == (unsigned long)-1;
2937         bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
2938
2939         /* We'll clean this up in btrfs_cleanup_transaction */
2940         if (trans->aborted)
2941                 return 0;
2942
2943         if (root->fs_info->creating_free_space_tree)
2944                 return 0;
2945
2946         if (root == root->fs_info->extent_root)
2947                 root = root->fs_info->tree_root;
2948
2949         delayed_refs = &trans->transaction->delayed_refs;
2950         if (count == 0)
2951                 count = atomic_read(&delayed_refs->num_entries) * 2;
2952
2953 again:
2954 #ifdef SCRAMBLE_DELAYED_REFS
2955         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2956 #endif
2957         trans->can_flush_pending_bgs = false;
2958         ret = __btrfs_run_delayed_refs(trans, root, count);
2959         if (ret < 0) {
2960                 btrfs_abort_transaction(trans, ret);
2961                 return ret;
2962         }
2963
2964         if (run_all) {
2965                 if (!list_empty(&trans->new_bgs))
2966                         btrfs_create_pending_block_groups(trans, root);
2967
2968                 spin_lock(&delayed_refs->lock);
2969                 node = rb_first(&delayed_refs->href_root);
2970                 if (!node) {
2971                         spin_unlock(&delayed_refs->lock);
2972                         goto out;
2973                 }
2974                 count = (unsigned long)-1;
2975
2976                 while (node) {
2977                         head = rb_entry(node, struct btrfs_delayed_ref_head,
2978                                         href_node);
2979                         if (btrfs_delayed_ref_is_head(&head->node)) {
2980                                 struct btrfs_delayed_ref_node *ref;
2981
2982                                 ref = &head->node;
2983                                 atomic_inc(&ref->refs);
2984
2985                                 spin_unlock(&delayed_refs->lock);
2986                                 /*
2987                                  * Mutex was contended, block until it's
2988                                  * released and try again
2989                                  */
2990                                 mutex_lock(&head->mutex);
2991                                 mutex_unlock(&head->mutex);
2992
2993                                 btrfs_put_delayed_ref(ref);
2994                                 cond_resched();
2995                                 goto again;
2996                         } else {
2997                                 WARN_ON(1);
2998                         }
2999                         node = rb_next(node);
3000                 }
3001                 spin_unlock(&delayed_refs->lock);
3002                 cond_resched();
3003                 goto again;
3004         }
3005 out:
3006         assert_qgroups_uptodate(trans);
3007         trans->can_flush_pending_bgs = can_flush_pending_bgs;
3008         return 0;
3009 }
3010
3011 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3012                                 struct btrfs_root *root,
3013                                 u64 bytenr, u64 num_bytes, u64 flags,
3014                                 int level, int is_data)
3015 {
3016         struct btrfs_delayed_extent_op *extent_op;
3017         int ret;
3018
3019         extent_op = btrfs_alloc_delayed_extent_op();
3020         if (!extent_op)
3021                 return -ENOMEM;
3022
3023         extent_op->flags_to_set = flags;
3024         extent_op->update_flags = true;
3025         extent_op->update_key = false;
3026         extent_op->is_data = is_data ? true : false;
3027         extent_op->level = level;
3028
3029         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
3030                                           num_bytes, extent_op);
3031         if (ret)
3032                 btrfs_free_delayed_extent_op(extent_op);
3033         return ret;
3034 }
3035
3036 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
3037                                       struct btrfs_root *root,
3038                                       struct btrfs_path *path,
3039                                       u64 objectid, u64 offset, u64 bytenr)
3040 {
3041         struct btrfs_delayed_ref_head *head;
3042         struct btrfs_delayed_ref_node *ref;
3043         struct btrfs_delayed_data_ref *data_ref;
3044         struct btrfs_delayed_ref_root *delayed_refs;
3045         int ret = 0;
3046
3047         delayed_refs = &trans->transaction->delayed_refs;
3048         spin_lock(&delayed_refs->lock);
3049         head = btrfs_find_delayed_ref_head(trans, bytenr);
3050         if (!head) {
3051                 spin_unlock(&delayed_refs->lock);
3052                 return 0;
3053         }
3054
3055         if (!mutex_trylock(&head->mutex)) {
3056                 atomic_inc(&head->node.refs);
3057                 spin_unlock(&delayed_refs->lock);
3058
3059                 btrfs_release_path(path);
3060
3061                 /*
3062                  * Mutex was contended, block until it's released and let
3063                  * caller try again
3064                  */
3065                 mutex_lock(&head->mutex);
3066                 mutex_unlock(&head->mutex);
3067                 btrfs_put_delayed_ref(&head->node);
3068                 return -EAGAIN;
3069         }
3070         spin_unlock(&delayed_refs->lock);
3071
3072         spin_lock(&head->lock);
3073         list_for_each_entry(ref, &head->ref_list, list) {
3074                 /* If it's a shared ref we know a cross reference exists */
3075                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3076                         ret = 1;
3077                         break;
3078                 }
3079
3080                 data_ref = btrfs_delayed_node_to_data_ref(ref);
3081
3082                 /*
3083                  * If our ref doesn't match the one we're currently looking at
3084                  * then we have a cross reference.
3085                  */
3086                 if (data_ref->root != root->root_key.objectid ||
3087                     data_ref->objectid != objectid ||
3088                     data_ref->offset != offset) {
3089                         ret = 1;
3090                         break;
3091                 }
3092         }
3093         spin_unlock(&head->lock);
3094         mutex_unlock(&head->mutex);
3095         return ret;
3096 }
3097
3098 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
3099                                         struct btrfs_root *root,
3100                                         struct btrfs_path *path,
3101                                         u64 objectid, u64 offset, u64 bytenr)
3102 {
3103         struct btrfs_root *extent_root = root->fs_info->extent_root;
3104         struct extent_buffer *leaf;
3105         struct btrfs_extent_data_ref *ref;
3106         struct btrfs_extent_inline_ref *iref;
3107         struct btrfs_extent_item *ei;
3108         struct btrfs_key key;
3109         u32 item_size;
3110         int ret;
3111
3112         key.objectid = bytenr;
3113         key.offset = (u64)-1;
3114         key.type = BTRFS_EXTENT_ITEM_KEY;
3115
3116         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3117         if (ret < 0)
3118                 goto out;
3119         BUG_ON(ret == 0); /* Corruption */
3120
3121         ret = -ENOENT;
3122         if (path->slots[0] == 0)
3123                 goto out;
3124
3125         path->slots[0]--;
3126         leaf = path->nodes[0];
3127         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3128
3129         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3130                 goto out;
3131
3132         ret = 1;
3133         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3134 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3135         if (item_size < sizeof(*ei)) {
3136                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3137                 goto out;
3138         }
3139 #endif
3140         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3141
3142         if (item_size != sizeof(*ei) +
3143             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3144                 goto out;
3145
3146         if (btrfs_extent_generation(leaf, ei) <=
3147             btrfs_root_last_snapshot(&root->root_item))
3148                 goto out;
3149
3150         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3151         if (btrfs_extent_inline_ref_type(leaf, iref) !=
3152             BTRFS_EXTENT_DATA_REF_KEY)
3153                 goto out;
3154
3155         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3156         if (btrfs_extent_refs(leaf, ei) !=
3157             btrfs_extent_data_ref_count(leaf, ref) ||
3158             btrfs_extent_data_ref_root(leaf, ref) !=
3159             root->root_key.objectid ||
3160             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3161             btrfs_extent_data_ref_offset(leaf, ref) != offset)
3162                 goto out;
3163
3164         ret = 0;
3165 out:
3166         return ret;
3167 }
3168
3169 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
3170                           struct btrfs_root *root,
3171                           u64 objectid, u64 offset, u64 bytenr)
3172 {
3173         struct btrfs_path *path;
3174         int ret;
3175         int ret2;
3176
3177         path = btrfs_alloc_path();
3178         if (!path)
3179                 return -ENOENT;
3180
3181         do {
3182                 ret = check_committed_ref(trans, root, path, objectid,
3183                                           offset, bytenr);
3184                 if (ret && ret != -ENOENT)
3185                         goto out;
3186
3187                 ret2 = check_delayed_ref(trans, root, path, objectid,
3188                                          offset, bytenr);
3189         } while (ret2 == -EAGAIN);
3190
3191         if (ret2 && ret2 != -ENOENT) {
3192                 ret = ret2;
3193                 goto out;
3194         }
3195
3196         if (ret != -ENOENT || ret2 != -ENOENT)
3197                 ret = 0;
3198 out:
3199         btrfs_free_path(path);
3200         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3201                 WARN_ON(ret > 0);
3202         return ret;
3203 }
3204
3205 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3206                            struct btrfs_root *root,
3207                            struct extent_buffer *buf,
3208                            int full_backref, int inc)
3209 {
3210         u64 bytenr;
3211         u64 num_bytes;
3212         u64 parent;
3213         u64 ref_root;
3214         u32 nritems;
3215         struct btrfs_key key;
3216         struct btrfs_file_extent_item *fi;
3217         int i;
3218         int level;
3219         int ret = 0;
3220         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3221                             u64, u64, u64, u64, u64, u64);
3222
3223
3224         if (btrfs_is_testing(root->fs_info))
3225                 return 0;
3226
3227         ref_root = btrfs_header_owner(buf);
3228         nritems = btrfs_header_nritems(buf);
3229         level = btrfs_header_level(buf);
3230
3231         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3232                 return 0;
3233
3234         if (inc)
3235                 process_func = btrfs_inc_extent_ref;
3236         else
3237                 process_func = btrfs_free_extent;
3238
3239         if (full_backref)
3240                 parent = buf->start;
3241         else
3242                 parent = 0;
3243
3244         for (i = 0; i < nritems; i++) {
3245                 if (level == 0) {
3246                         btrfs_item_key_to_cpu(buf, &key, i);
3247                         if (key.type != BTRFS_EXTENT_DATA_KEY)
3248                                 continue;
3249                         fi = btrfs_item_ptr(buf, i,
3250                                             struct btrfs_file_extent_item);
3251                         if (btrfs_file_extent_type(buf, fi) ==
3252                             BTRFS_FILE_EXTENT_INLINE)
3253                                 continue;
3254                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3255                         if (bytenr == 0)
3256                                 continue;
3257
3258                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3259                         key.offset -= btrfs_file_extent_offset(buf, fi);
3260                         ret = process_func(trans, root, bytenr, num_bytes,
3261                                            parent, ref_root, key.objectid,
3262                                            key.offset);
3263                         if (ret)
3264                                 goto fail;
3265                 } else {
3266                         bytenr = btrfs_node_blockptr(buf, i);
3267                         num_bytes = root->nodesize;
3268                         ret = process_func(trans, root, bytenr, num_bytes,
3269                                            parent, ref_root, level - 1, 0);
3270                         if (ret)
3271                                 goto fail;
3272                 }
3273         }
3274         return 0;
3275 fail:
3276         return ret;
3277 }
3278
/*
 * Add one reference on every extent pointed to by @buf.  Thin wrapper
 * around __btrfs_mod_ref() with inc set; returns its result.
 */
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	const int inc = 1;

	return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}
3284
/*
 * Drop one reference on every extent pointed to by @buf.  Thin wrapper
 * around __btrfs_mod_ref() with inc cleared; returns its result.
 */
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	const int inc = 0;

	return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}
3290
3291 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3292                                  struct btrfs_root *root,
3293                                  struct btrfs_path *path,
3294                                  struct btrfs_block_group_cache *cache)
3295 {
3296         int ret;
3297         struct btrfs_root *extent_root = root->fs_info->extent_root;
3298         unsigned long bi;
3299         struct extent_buffer *leaf;
3300
3301         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3302         if (ret) {
3303                 if (ret > 0)
3304                         ret = -ENOENT;
3305                 goto fail;
3306         }
3307
3308         leaf = path->nodes[0];
3309         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3310         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3311         btrfs_mark_buffer_dirty(leaf);
3312 fail:
3313         btrfs_release_path(path);
3314         return ret;
3315
3316 }
3317
3318 static struct btrfs_block_group_cache *
3319 next_block_group(struct btrfs_root *root,
3320                  struct btrfs_block_group_cache *cache)
3321 {
3322         struct rb_node *node;
3323
3324         spin_lock(&root->fs_info->block_group_cache_lock);
3325
3326         /* If our block group was removed, we need a full search. */
3327         if (RB_EMPTY_NODE(&cache->cache_node)) {
3328                 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3329
3330                 spin_unlock(&root->fs_info->block_group_cache_lock);
3331                 btrfs_put_block_group(cache);
3332                 cache = btrfs_lookup_first_block_group(root->fs_info,
3333                                                        next_bytenr);
3334                 return cache;
3335         }
3336         node = rb_next(&cache->cache_node);
3337         btrfs_put_block_group(cache);
3338         if (node) {
3339                 cache = rb_entry(node, struct btrfs_block_group_cache,
3340                                  cache_node);
3341                 btrfs_get_block_group(cache);
3342         } else
3343                 cache = NULL;
3344         spin_unlock(&root->fs_info->block_group_cache_lock);
3345         return cache;
3346 }
3347
3348 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3349                             struct btrfs_trans_handle *trans,
3350                             struct btrfs_path *path)
3351 {
3352         struct btrfs_root *root = block_group->fs_info->tree_root;
3353         struct inode *inode = NULL;
3354         u64 alloc_hint = 0;
3355         int dcs = BTRFS_DC_ERROR;
3356         u64 num_pages = 0;
3357         int retries = 0;
3358         int ret = 0;
3359
3360         /*
3361          * If this block group is smaller than 100 megs don't bother caching the
3362          * block group.
3363          */
3364         if (block_group->key.offset < (100 * SZ_1M)) {
3365                 spin_lock(&block_group->lock);
3366                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3367                 spin_unlock(&block_group->lock);
3368                 return 0;
3369         }
3370
3371         if (trans->aborted)
3372                 return 0;
3373 again:
3374         inode = lookup_free_space_inode(root, block_group, path);
3375         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3376                 ret = PTR_ERR(inode);
3377                 btrfs_release_path(path);
3378                 goto out;
3379         }
3380
3381         if (IS_ERR(inode)) {
3382                 BUG_ON(retries);
3383                 retries++;
3384
3385                 if (block_group->ro)
3386                         goto out_free;
3387
3388                 ret = create_free_space_inode(root, trans, block_group, path);
3389                 if (ret)
3390                         goto out_free;
3391                 goto again;
3392         }
3393
3394         /* We've already setup this transaction, go ahead and exit */
3395         if (block_group->cache_generation == trans->transid &&
3396             i_size_read(inode)) {
3397                 dcs = BTRFS_DC_SETUP;
3398                 goto out_put;
3399         }
3400
3401         /*
3402          * We want to set the generation to 0, that way if anything goes wrong
3403          * from here on out we know not to trust this cache when we load up next
3404          * time.
3405          */
3406         BTRFS_I(inode)->generation = 0;
3407         ret = btrfs_update_inode(trans, root, inode);
3408         if (ret) {
3409                 /*
3410                  * So theoretically we could recover from this, simply set the
3411                  * super cache generation to 0 so we know to invalidate the
3412                  * cache, but then we'd have to keep track of the block groups
3413                  * that fail this way so we know we _have_ to reset this cache
3414                  * before the next commit or risk reading stale cache.  So to
3415                  * limit our exposure to horrible edge cases lets just abort the
3416                  * transaction, this only happens in really bad situations
3417                  * anyway.
3418                  */
3419                 btrfs_abort_transaction(trans, ret);
3420                 goto out_put;
3421         }
3422         WARN_ON(ret);
3423
3424         if (i_size_read(inode) > 0) {
3425                 ret = btrfs_check_trunc_cache_free_space(root,
3426                                         &root->fs_info->global_block_rsv);
3427                 if (ret)
3428                         goto out_put;
3429
3430                 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
3431                 if (ret)
3432                         goto out_put;
3433         }
3434
3435         spin_lock(&block_group->lock);
3436         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3437             !btrfs_test_opt(root->fs_info, SPACE_CACHE)) {
3438                 /*
3439                  * don't bother trying to write stuff out _if_
3440                  * a) we're not cached,
3441                  * b) we're with nospace_cache mount option.
3442                  */
3443                 dcs = BTRFS_DC_WRITTEN;
3444                 spin_unlock(&block_group->lock);
3445                 goto out_put;
3446         }
3447         spin_unlock(&block_group->lock);
3448
3449         /*
3450          * We hit an ENOSPC when setting up the cache in this transaction, just
3451          * skip doing the setup, we've already cleared the cache so we're safe.
3452          */
3453         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3454                 ret = -ENOSPC;
3455                 goto out_put;
3456         }
3457
3458         /*
3459          * Try to preallocate enough space based on how big the block group is.
3460          * Keep in mind this has to include any pinned space which could end up
3461          * taking up quite a bit since it's not folded into the other space
3462          * cache.
3463          */
3464         num_pages = div_u64(block_group->key.offset, SZ_256M);
3465         if (!num_pages)
3466                 num_pages = 1;
3467
3468         num_pages *= 16;
3469         num_pages *= PAGE_SIZE;
3470
3471         ret = btrfs_check_data_free_space(inode, 0, num_pages);
3472         if (ret)
3473                 goto out_put;
3474
3475         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3476                                               num_pages, num_pages,
3477                                               &alloc_hint);
3478         /*
3479          * Our cache requires contiguous chunks so that we don't modify a bunch
3480          * of metadata or split extents when writing the cache out, which means
3481          * we can enospc if we are heavily fragmented in addition to just normal
3482          * out of space conditions.  So if we hit this just skip setting up any
3483          * other block groups for this transaction, maybe we'll unpin enough
3484          * space the next time around.
3485          */
3486         if (!ret)
3487                 dcs = BTRFS_DC_SETUP;
3488         else if (ret == -ENOSPC)
3489                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3490
3491 out_put:
3492         iput(inode);
3493 out_free:
3494         btrfs_release_path(path);
3495 out:
3496         spin_lock(&block_group->lock);
3497         if (!ret && dcs == BTRFS_DC_SETUP)
3498                 block_group->cache_generation = trans->transid;
3499         block_group->disk_cache_state = dcs;
3500         spin_unlock(&block_group->lock);
3501
3502         return ret;
3503 }
3504
3505 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3506                             struct btrfs_root *root)
3507 {
3508         struct btrfs_block_group_cache *cache, *tmp;
3509         struct btrfs_transaction *cur_trans = trans->transaction;
3510         struct btrfs_path *path;
3511
3512         if (list_empty(&cur_trans->dirty_bgs) ||
3513             !btrfs_test_opt(root->fs_info, SPACE_CACHE))
3514                 return 0;
3515
3516         path = btrfs_alloc_path();
3517         if (!path)
3518                 return -ENOMEM;
3519
3520         /* Could add new block groups, use _safe just in case */
3521         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3522                                  dirty_list) {
3523                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3524                         cache_save_setup(cache, trans, path);
3525         }
3526
3527         btrfs_free_path(path);
3528         return 0;
3529 }
3530
3531 /*
3532  * transaction commit does final block group cache writeback during a
3533  * critical section where nothing is allowed to change the FS.  This is
3534  * required in order for the cache to actually match the block group,
3535  * but can introduce a lot of latency into the commit.
3536  *
3537  * So, btrfs_start_dirty_block_groups is here to kick off block group
3538  * cache IO.  There's a chance we'll have to redo some of it if the
3539  * block group changes again during the commit, but it greatly reduces
3540  * the commit latency by getting rid of the easy block groups while
3541  * we're still allowing others to join the commit.
3542  */
3543 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3544                                    struct btrfs_root *root)
3545 {
3546         struct btrfs_block_group_cache *cache;
3547         struct btrfs_transaction *cur_trans = trans->transaction;
3548         int ret = 0;
3549         int should_put;
3550         struct btrfs_path *path = NULL;
3551         LIST_HEAD(dirty);
3552         struct list_head *io = &cur_trans->io_bgs;
3553         int num_started = 0;
3554         int loops = 0;
3555
3556         spin_lock(&cur_trans->dirty_bgs_lock);
3557         if (list_empty(&cur_trans->dirty_bgs)) {
3558                 spin_unlock(&cur_trans->dirty_bgs_lock);
3559                 return 0;
3560         }
3561         list_splice_init(&cur_trans->dirty_bgs, &dirty);
3562         spin_unlock(&cur_trans->dirty_bgs_lock);
3563
3564 again:
3565         /*
3566          * make sure all the block groups on our dirty list actually
3567          * exist
3568          */
3569         btrfs_create_pending_block_groups(trans, root);
3570
3571         if (!path) {
3572                 path = btrfs_alloc_path();
3573                 if (!path)
3574                         return -ENOMEM;
3575         }
3576
3577         /*
3578          * cache_write_mutex is here only to save us from balance or automatic
3579          * removal of empty block groups deleting this block group while we are
3580          * writing out the cache
3581          */
3582         mutex_lock(&trans->transaction->cache_write_mutex);
3583         while (!list_empty(&dirty)) {
3584                 cache = list_first_entry(&dirty,
3585                                          struct btrfs_block_group_cache,
3586                                          dirty_list);
3587                 /*
3588                  * this can happen if something re-dirties a block
3589                  * group that is already under IO.  Just wait for it to
3590                  * finish and then do it all again
3591                  */
3592                 if (!list_empty(&cache->io_list)) {
3593                         list_del_init(&cache->io_list);
3594                         btrfs_wait_cache_io(root, trans, cache,
3595                                             &cache->io_ctl, path,
3596                                             cache->key.objectid);
3597                         btrfs_put_block_group(cache);
3598                 }
3599
3600
3601                 /*
3602                  * btrfs_wait_cache_io uses the cache->dirty_list to decide
3603                  * if it should update the cache_state.  Don't delete
3604                  * until after we wait.
3605                  *
3606                  * Since we're not running in the commit critical section
3607                  * we need the dirty_bgs_lock to protect from update_block_group
3608                  */
3609                 spin_lock(&cur_trans->dirty_bgs_lock);
3610                 list_del_init(&cache->dirty_list);
3611                 spin_unlock(&cur_trans->dirty_bgs_lock);
3612
3613                 should_put = 1;
3614
3615                 cache_save_setup(cache, trans, path);
3616
3617                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3618                         cache->io_ctl.inode = NULL;
3619                         ret = btrfs_write_out_cache(root, trans, cache, path);
3620                         if (ret == 0 && cache->io_ctl.inode) {
3621                                 num_started++;
3622                                 should_put = 0;
3623
3624                                 /*
3625                                  * the cache_write_mutex is protecting
3626                                  * the io_list
3627                                  */
3628                                 list_add_tail(&cache->io_list, io);
3629                         } else {
3630                                 /*
3631                                  * if we failed to write the cache, the
3632                                  * generation will be bad and life goes on
3633                                  */
3634                                 ret = 0;
3635                         }
3636                 }
3637                 if (!ret) {
3638                         ret = write_one_cache_group(trans, root, path, cache);
3639                         /*
3640                          * Our block group might still be attached to the list
3641                          * of new block groups in the transaction handle of some
3642                          * other task (struct btrfs_trans_handle->new_bgs). This
3643                          * means its block group item isn't yet in the extent
3644                          * tree. If this happens ignore the error, as we will
3645                          * try again later in the critical section of the
3646                          * transaction commit.
3647                          */
3648                         if (ret == -ENOENT) {
3649                                 ret = 0;
3650                                 spin_lock(&cur_trans->dirty_bgs_lock);
3651                                 if (list_empty(&cache->dirty_list)) {
3652                                         list_add_tail(&cache->dirty_list,
3653                                                       &cur_trans->dirty_bgs);
3654                                         btrfs_get_block_group(cache);
3655                                 }
3656                                 spin_unlock(&cur_trans->dirty_bgs_lock);
3657                         } else if (ret) {
3658                                 btrfs_abort_transaction(trans, ret);
3659                         }
3660                 }
3661
3662                 /* if its not on the io list, we need to put the block group */
3663                 if (should_put)
3664                         btrfs_put_block_group(cache);
3665
3666                 if (ret)
3667                         break;
3668
3669                 /*
3670                  * Avoid blocking other tasks for too long. It might even save
3671                  * us from writing caches for block groups that are going to be
3672                  * removed.
3673                  */
3674                 mutex_unlock(&trans->transaction->cache_write_mutex);
3675                 mutex_lock(&trans->transaction->cache_write_mutex);
3676         }
3677         mutex_unlock(&trans->transaction->cache_write_mutex);
3678
3679         /*
3680          * go through delayed refs for all the stuff we've just kicked off
3681          * and then loop back (just once)
3682          */
3683         ret = btrfs_run_delayed_refs(trans, root, 0);
3684         if (!ret && loops == 0) {
3685                 loops++;
3686                 spin_lock(&cur_trans->dirty_bgs_lock);
3687                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3688                 /*
3689                  * dirty_bgs_lock protects us from concurrent block group
3690                  * deletes too (not just cache_write_mutex).
3691                  */
3692                 if (!list_empty(&dirty)) {
3693                         spin_unlock(&cur_trans->dirty_bgs_lock);
3694                         goto again;
3695                 }
3696                 spin_unlock(&cur_trans->dirty_bgs_lock);
3697         }
3698
3699         btrfs_free_path(path);
3700         return ret;
3701 }
3702
/*
 * Write out every block group still on the transaction's dirty list.
 *
 * For each dirty block group this sets up its free space cache
 * (cache_save_setup), runs delayed refs, starts writeout of the cache when
 * its disk_cache_state is BTRFS_DC_SETUP, and updates the block group item
 * via write_one_cache_group().  Block groups whose cache writeout was started
 * are queued on the transaction's io_bgs list and waited for at the end.
 *
 * Once @ret becomes non-zero the remaining dirty block groups are still
 * removed from the list and released, but no further delayed refs, cache
 * writes or item updates are attempted for them.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the first
 * error hit by btrfs_run_delayed_refs()/write_one_cache_group().  A failure
 * of write_one_cache_group() also aborts the transaction.
 */
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
{
        struct btrfs_block_group_cache *cache;
        struct btrfs_transaction *cur_trans = trans->transaction;
        int ret = 0;
        int should_put;
        struct btrfs_path *path;
        struct list_head *io = &cur_trans->io_bgs;
        /* counted below but never read in this function */
        int num_started = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * Even though we are in the critical section of the transaction commit,
         * we can still have concurrent tasks adding elements to this
         * transaction's list of dirty block groups. These tasks correspond to
         * endio free space workers started when writeback finishes for a
         * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
         * allocate new block groups as a result of COWing nodes of the root
         * tree when updating the free space inode. The writeback for the space
         * caches is triggered by an earlier call to
         * btrfs_start_dirty_block_groups() and iterations of the following
         * loop.
         * Also we want to do the cache_save_setup first and then run the
         * delayed refs to make sure we have the best chance at doing this all
         * in one shot.
         */
        spin_lock(&cur_trans->dirty_bgs_lock);
        while (!list_empty(&cur_trans->dirty_bgs)) {
                cache = list_first_entry(&cur_trans->dirty_bgs,
                                         struct btrfs_block_group_cache,
                                         dirty_list);

                /*
                 * this can happen if cache_save_setup re-dirties a block
                 * group that is already under IO.  Just wait for it to
                 * finish and then do it all again
                 */
                if (!list_empty(&cache->io_list)) {
                        /* the wait is done with dirty_bgs_lock dropped */
                        spin_unlock(&cur_trans->dirty_bgs_lock);
                        list_del_init(&cache->io_list);
                        btrfs_wait_cache_io(root, trans, cache,
                                            &cache->io_ctl, path,
                                            cache->key.objectid);
                        btrfs_put_block_group(cache);
                        spin_lock(&cur_trans->dirty_bgs_lock);
                }

                /*
                 * don't remove from the dirty list until after we've waited
                 * on any pending IO
                 */
                list_del_init(&cache->dirty_list);
                spin_unlock(&cur_trans->dirty_bgs_lock);
                /* cleared below if the block group is handed to the io list */
                should_put = 1;

                cache_save_setup(cache, trans, path);

                if (!ret)
                        ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);

                if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
                        cache->io_ctl.inode = NULL;
                        ret = btrfs_write_out_cache(root, trans, cache, path);
                        if (ret == 0 && cache->io_ctl.inode) {
                                num_started++;
                                should_put = 0;
                                list_add_tail(&cache->io_list, io);
                        } else {
                                /*
                                 * if we failed to write the cache, the
                                 * generation will be bad and life goes on
                                 */
                                ret = 0;
                        }
                }
                if (!ret) {
                        ret = write_one_cache_group(trans, root, path, cache);
                        /*
                         * One of the free space endio workers might have
                         * created a new block group while updating a free space
                         * cache's inode (at inode.c:btrfs_finish_ordered_io())
                         * and hasn't released its transaction handle yet, in
                         * which case the new block group is still attached to
                         * its transaction handle and its creation has not
                         * finished yet (no block group item in the extent tree
                         * yet, etc). If this is the case, wait for all free
                         * space endio workers to finish and retry. This is a
                         * very rare case so no need for a more efficient and
                         * complex approach.
                         */
                        if (ret == -ENOENT) {
                                wait_event(cur_trans->writer_wait,
                                   atomic_read(&cur_trans->num_writers) == 1);
                                ret = write_one_cache_group(trans, root, path,
                                                            cache);
                        }
                        if (ret)
                                btrfs_abort_transaction(trans, ret);
                }

                /* if its not on the io list, we need to put the block group */
                if (should_put)
                        btrfs_put_block_group(cache);
                spin_lock(&cur_trans->dirty_bgs_lock);
        }
        spin_unlock(&cur_trans->dirty_bgs_lock);

        /* wait for every cache writeout queued on the io list, then release it */
        while (!list_empty(io)) {
                cache = list_first_entry(io, struct btrfs_block_group_cache,
                                         io_list);
                list_del_init(&cache->io_list);
                btrfs_wait_cache_io(root, trans, cache,
                                    &cache->io_ctl, path, cache->key.objectid);
                btrfs_put_block_group(cache);
        }

        btrfs_free_path(path);
        return ret;
}
3826
3827 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3828 {
3829         struct btrfs_block_group_cache *block_group;
3830         int readonly = 0;
3831
3832         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3833         if (!block_group || block_group->ro)
3834                 readonly = 1;
3835         if (block_group)
3836                 btrfs_put_block_group(block_group);
3837         return readonly;
3838 }
3839
3840 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3841 {
3842         struct btrfs_block_group_cache *bg;
3843         bool ret = true;
3844
3845         bg = btrfs_lookup_block_group(fs_info, bytenr);
3846         if (!bg)
3847                 return false;
3848
3849         spin_lock(&bg->lock);
3850         if (bg->ro)
3851                 ret = false;
3852         else
3853                 atomic_inc(&bg->nocow_writers);
3854         spin_unlock(&bg->lock);
3855
3856         /* no put on block group, done by btrfs_dec_nocow_writers */
3857         if (!ret)
3858                 btrfs_put_block_group(bg);
3859
3860         return ret;
3861
3862 }
3863
3864 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3865 {
3866         struct btrfs_block_group_cache *bg;
3867
3868         bg = btrfs_lookup_block_group(fs_info, bytenr);
3869         ASSERT(bg);
3870         if (atomic_dec_and_test(&bg->nocow_writers))
3871                 wake_up_atomic_t(&bg->nocow_writers);
3872         /*
3873          * Once for our lookup and once for the lookup done by a previous call
3874          * to btrfs_inc_nocow_writers()
3875          */
3876         btrfs_put_block_group(bg);
3877         btrfs_put_block_group(bg);
3878 }
3879
/*
 * Wait action handed to wait_on_atomic_t() by btrfs_wait_nocow_writers():
 * simply calls schedule() and returns 0.  @a is not used.
 */
static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
{
        schedule();
        return 0;
}
3885
/*
 * Wait on @bg's nocow_writers counter via wait_on_atomic_t(), sleeping in
 * TASK_UNINTERRUPTIBLE.  The matching wake-up is issued by
 * btrfs_dec_nocow_writers() when the counter reaches zero.
 */
void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
{
        wait_on_atomic_t(&bg->nocow_writers,
                         btrfs_wait_nocow_writers_atomic_t,
                         TASK_UNINTERRUPTIBLE);
}
3892
3893 static const char *alloc_name(u64 flags)
3894 {
3895         switch (flags) {
3896         case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3897                 return "mixed";
3898         case BTRFS_BLOCK_GROUP_METADATA:
3899                 return "metadata";
3900         case BTRFS_BLOCK_GROUP_DATA:
3901                 return "data";
3902         case BTRFS_BLOCK_GROUP_SYSTEM:
3903                 return "system";
3904         default:
3905                 WARN_ON(1);
3906                 return "invalid-combination";
3907         };
3908 }
3909
3910 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3911                              u64 total_bytes, u64 bytes_used,
3912                              u64 bytes_readonly,
3913                              struct btrfs_space_info **space_info)
3914 {
3915         struct btrfs_space_info *found;
3916         int i;
3917         int factor;
3918         int ret;
3919
3920         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3921                      BTRFS_BLOCK_GROUP_RAID10))
3922                 factor = 2;
3923         else
3924                 factor = 1;
3925
3926         found = __find_space_info(info, flags);
3927         if (found) {
3928                 spin_lock(&found->lock);
3929                 found->total_bytes += total_bytes;
3930                 found->disk_total += total_bytes * factor;
3931                 found->bytes_used += bytes_used;
3932                 found->disk_used += bytes_used * factor;
3933                 found->bytes_readonly += bytes_readonly;
3934                 if (total_bytes > 0)
3935                         found->full = 0;
3936                 space_info_add_new_bytes(info, found, total_bytes -
3937                                          bytes_used - bytes_readonly);
3938                 spin_unlock(&found->lock);
3939                 *space_info = found;
3940                 return 0;
3941         }
3942         found = kzalloc(sizeof(*found), GFP_NOFS);
3943         if (!found)
3944                 return -ENOMEM;
3945
3946         ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
3947         if (ret) {
3948                 kfree(found);
3949                 return ret;
3950         }
3951
3952         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3953                 INIT_LIST_HEAD(&found->block_groups[i]);
3954         init_rwsem(&found->groups_sem);
3955         spin_lock_init(&found->lock);
3956         found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3957         found->total_bytes = total_bytes;
3958         found->disk_total = total_bytes * factor;
3959         found->bytes_used = bytes_used;
3960         found->disk_used = bytes_used * factor;
3961         found->bytes_pinned = 0;
3962         found->bytes_reserved = 0;
3963         found->bytes_readonly = bytes_readonly;
3964         found->bytes_may_use = 0;
3965         found->full = 0;
3966         found->max_extent_size = 0;
3967         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3968         found->chunk_alloc = 0;
3969         found->flush = 0;
3970         init_waitqueue_head(&found->wait);
3971         INIT_LIST_HEAD(&found->ro_bgs);
3972         INIT_LIST_HEAD(&found->tickets);
3973         INIT_LIST_HEAD(&found->priority_tickets);
3974
3975         ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3976                                     info->space_info_kobj, "%s",
3977                                     alloc_name(found->flags));
3978         if (ret) {
3979                 kfree(found);
3980                 return ret;
3981         }
3982
3983         *space_info = found;
3984         list_add_rcu(&found->list, &info->space_info);
3985         if (flags & BTRFS_BLOCK_GROUP_DATA)
3986                 info->data_sinfo = found;
3987
3988         return ret;
3989 }
3990
3991 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3992 {
3993         u64 extra_flags = chunk_to_extended(flags) &
3994                                 BTRFS_EXTENDED_PROFILE_MASK;
3995
3996         write_seqlock(&fs_info->profiles_lock);
3997         if (flags & BTRFS_BLOCK_GROUP_DATA)
3998                 fs_info->avail_data_alloc_bits |= extra_flags;
3999         if (flags & BTRFS_BLOCK_GROUP_METADATA)
4000                 fs_info->avail_metadata_alloc_bits |= extra_flags;
4001         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4002                 fs_info->avail_system_alloc_bits |= extra_flags;
4003         write_sequnlock(&fs_info->profiles_lock);
4004 }
4005
4006 /*
4007  * returns target flags in extended format or 0 if restripe for this
4008  * chunk_type is not in progress
4009  *
4010  * should be called with either volume_mutex or balance_lock held
4011  */
4012 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
4013 {
4014         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4015         u64 target = 0;
4016
4017         if (!bctl)
4018                 return 0;
4019
4020         if (flags & BTRFS_BLOCK_GROUP_DATA &&
4021             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4022                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
4023         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
4024                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4025                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
4026         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
4027                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4028                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
4029         }
4030
4031         return target;
4032 }
4033
4034 /*
4035  * @flags: available profiles in extended format (see ctree.h)
4036  *
4037  * Returns reduced profile in chunk format.  If profile changing is in
4038  * progress (either running or paused) picks the target profile (if it's
4039  * already available), otherwise falls back to plain reducing.
4040  */
4041 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
4042 {
4043         u64 num_devices = root->fs_info->fs_devices->rw_devices;
4044         u64 target;
4045         u64 raid_type;
4046         u64 allowed = 0;
4047
4048         /*
4049          * see if restripe for this chunk_type is in progress, if so
4050          * try to reduce to the target profile
4051          */
4052         spin_lock(&root->fs_info->balance_lock);
4053         target = get_restripe_target(root->fs_info, flags);
4054         if (target) {
4055                 /* pick target profile only if it's already available */
4056                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4057                         spin_unlock(&root->fs_info->balance_lock);
4058                         return extended_to_chunk(target);
4059                 }
4060         }
4061         spin_unlock(&root->fs_info->balance_lock);
4062
4063         /* First, mask out the RAID levels which aren't possible */
4064         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4065                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4066                         allowed |= btrfs_raid_group[raid_type];
4067         }
4068         allowed &= flags;
4069
4070         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4071                 allowed = BTRFS_BLOCK_GROUP_RAID6;
4072         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4073                 allowed = BTRFS_BLOCK_GROUP_RAID5;
4074         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4075                 allowed = BTRFS_BLOCK_GROUP_RAID10;
4076         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4077                 allowed = BTRFS_BLOCK_GROUP_RAID1;
4078         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4079                 allowed = BTRFS_BLOCK_GROUP_RAID0;
4080
4081         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4082
4083         return extended_to_chunk(flags | allowed);
4084 }
4085
4086 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
4087 {
4088         unsigned seq;
4089         u64 flags;
4090
4091         do {
4092                 flags = orig_flags;
4093                 seq = read_seqbegin(&root->fs_info->profiles_lock);
4094
4095                 if (flags & BTRFS_BLOCK_GROUP_DATA)
4096                         flags |= root->fs_info->avail_data_alloc_bits;
4097                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4098                         flags |= root->fs_info->avail_system_alloc_bits;
4099                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4100                         flags |= root->fs_info->avail_metadata_alloc_bits;
4101         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
4102
4103         return btrfs_reduce_alloc_profile(root, flags);
4104 }
4105
4106 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
4107 {
4108         u64 flags;
4109         u64 ret;
4110
4111         if (data)
4112                 flags = BTRFS_BLOCK_GROUP_DATA;
4113         else if (root == root->fs_info->chunk_root)
4114                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
4115         else
4116                 flags = BTRFS_BLOCK_GROUP_METADATA;
4117
4118         ret = get_alloc_profile(root, flags);
4119         return ret;
4120 }
4121
4122 int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
4123 {
4124         struct btrfs_space_info *data_sinfo;
4125         struct btrfs_root *root = BTRFS_I(inode)->root;
4126         struct btrfs_fs_info *fs_info = root->fs_info;
4127         u64 used;
4128         int ret = 0;
4129         int need_commit = 2;
4130         int have_pinned_space;
4131
4132         /* make sure bytes are sectorsize aligned */
4133         bytes = ALIGN(bytes, root->sectorsize);
4134
4135         if (btrfs_is_free_space_inode(inode)) {
4136                 need_commit = 0;
4137                 ASSERT(current->journal_info);
4138         }
4139
4140         data_sinfo = fs_info->data_sinfo;
4141         if (!data_sinfo)
4142                 goto alloc;
4143
4144 again:
4145         /* make sure we have enough space to handle the data first */
4146         spin_lock(&data_sinfo->lock);
4147         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
4148                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
4149                 data_sinfo->bytes_may_use;
4150
4151         if (used + bytes > data_sinfo->total_bytes) {
4152                 struct btrfs_trans_handle *trans;
4153
4154                 /*
4155                  * if we don't have enough free bytes in this space then we need
4156                  * to alloc a new chunk.
4157                  */
4158                 if (!data_sinfo->full) {
4159                         u64 alloc_target;
4160
4161                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4162                         spin_unlock(&data_sinfo->lock);
4163 alloc:
4164                         alloc_target = btrfs_get_alloc_profile(root, 1);
4165                         /*
4166                          * It is ugly that we don't call nolock join
4167                          * transaction for the free space inode case here.
4168                          * But it is safe because we only do the data space
4169                          * reservation for the free space cache in the
4170                          * transaction context, the common join transaction
4171                          * just increase the counter of the current transaction
4172                          * handler, doesn't try to acquire the trans_lock of
4173                          * the fs.
4174                          */
4175                         trans = btrfs_join_transaction(root);
4176                         if (IS_ERR(trans))
4177                                 return PTR_ERR(trans);
4178
4179                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4180                                              alloc_target,
4181                                              CHUNK_ALLOC_NO_FORCE);
4182                         btrfs_end_transaction(trans, root);
4183                         if (ret < 0) {
4184                                 if (ret != -ENOSPC)
4185                                         return ret;
4186                                 else {
4187                                         have_pinned_space = 1;
4188                                         goto commit_trans;
4189                                 }
4190                         }
4191
4192                         if (!data_sinfo)
4193                                 data_sinfo = fs_info->data_sinfo;
4194
4195                         goto again;
4196                 }
4197
4198                 /*
4199                  * If we don't have enough pinned space to deal with this
4200                  * allocation, and no removed chunk in current transaction,
4201                  * don't bother committing the transaction.
4202                  */
4203                 have_pinned_space = percpu_counter_compare(
4204                         &data_sinfo->total_bytes_pinned,
4205                         used + bytes - data_sinfo->total_bytes);
4206                 spin_unlock(&data_sinfo->lock);
4207
4208                 /* commit the current transaction and try again */
4209 commit_trans:
4210                 if (need_commit &&
4211                     !atomic_read(&root->fs_info->open_ioctl_trans)) {
4212                         need_commit--;
4213
4214                         if (need_commit > 0) {
4215                                 btrfs_start_delalloc_roots(fs_info, 0, -1);
4216                                 btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
4217                         }
4218
4219                         trans = btrfs_join_transaction(root);
4220                         if (IS_ERR(trans))
4221                                 return PTR_ERR(trans);
4222                         if (have_pinned_space >= 0 ||
4223                             test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4224                                      &trans->transaction->flags) ||
4225                             need_commit > 0) {
4226                                 ret = btrfs_commit_transaction(trans, root);
4227                                 if (ret)
4228                                         return ret;
4229                                 /*
4230                                  * The cleaner kthread might still be doing iput
4231                                  * operations. Wait for it to finish so that
4232                                  * more space is released.
4233                                  */
4234                                 mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
4235                                 mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
4236                                 goto again;
4237                         } else {
4238                                 btrfs_end_transaction(trans, root);
4239                         }
4240                 }
4241
4242                 trace_btrfs_space_reservation(root->fs_info,
4243                                               "space_info:enospc",
4244                                               data_sinfo->flags, bytes, 1);
4245                 return -ENOSPC;
4246         }
4247         data_sinfo->bytes_may_use += bytes;
4248         trace_btrfs_space_reservation(root->fs_info, "space_info",
4249                                       data_sinfo->flags, bytes, 1);
4250         spin_unlock(&data_sinfo->lock);
4251
4252         return ret;
4253 }
4254
4255 /*
4256  * New check_data_free_space() with ability for precious data reservation
4257  * Will replace old btrfs_check_data_free_space(), but for patch split,
4258  * add a new function first and then replace it.
4259  */
4260 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
4261 {
4262         struct btrfs_root *root = BTRFS_I(inode)->root;
4263         int ret;
4264
4265         /* align the range */
4266         len = round_up(start + len, root->sectorsize) -
4267               round_down(start, root->sectorsize);
4268         start = round_down(start, root->sectorsize);
4269
4270         ret = btrfs_alloc_data_chunk_ondemand(inode, len);
4271         if (ret < 0)
4272                 return ret;
4273
4274         /*
4275          * Use new btrfs_qgroup_reserve_data to reserve precious data space
4276          *
4277          * TODO: Find a good method to avoid reserve data space for NOCOW
4278          * range, but don't impact performance on quota disable case.
4279          */
4280         ret = btrfs_qgroup_reserve_data(inode, start, len);
4281         return ret;
4282 }
4283
4284 /*
4285  * Called if we need to clear a data reservation for this inode
4286  * Normally in a error case.
4287  *
4288  * This one will *NOT* use accurate qgroup reserved space API, just for case
4289  * which we can't sleep and is sure it won't affect qgroup reserved space.
4290  * Like clear_bit_hook().
4291  */
4292 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4293                                             u64 len)
4294 {
4295         struct btrfs_root *root = BTRFS_I(inode)->root;
4296         struct btrfs_space_info *data_sinfo;
4297
4298         /* Make sure the range is aligned to sectorsize */
4299         len = round_up(start + len, root->sectorsize) -
4300               round_down(start, root->sectorsize);
4301         start = round_down(start, root->sectorsize);
4302
4303         data_sinfo = root->fs_info->data_sinfo;
4304         spin_lock(&data_sinfo->lock);
4305         if (WARN_ON(data_sinfo->bytes_may_use < len))
4306                 data_sinfo->bytes_may_use = 0;
4307         else
4308                 data_sinfo->bytes_may_use -= len;
4309         trace_btrfs_space_reservation(root->fs_info, "space_info",
4310                                       data_sinfo->flags, len, 0);
4311         spin_unlock(&data_sinfo->lock);
4312 }
4313
/*
 * Called if we need to clear a data reservation for this inode,
 * normally in an error case.
 *
 * Releases the range [@start, @start + @len) from the data space_info
 * accounting (btrfs_free_reserved_data_space_noquota()) and then from the
 * qgroup data reservation (btrfs_qgroup_free_data()), i.e. this one also
 * handles the per-inode data rsv map of the accurate reserved space
 * framework.
 */
void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
{
        btrfs_free_reserved_data_space_noquota(inode, start, len);
        btrfs_qgroup_free_data(inode, start, len);
}
4326
4327 static void force_metadata_allocation(struct btrfs_fs_info *info)
4328 {
4329         struct list_head *head = &info->space_info;
4330         struct btrfs_space_info *found;
4331
4332         rcu_read_lock();
4333         list_for_each_entry_rcu(found, head, list) {
4334                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4335                         found->force_alloc = CHUNK_ALLOC_FORCE;
4336         }
4337         rcu_read_unlock();
4338 }
4339
4340 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4341 {
4342         return (global->size << 1);
4343 }
4344
4345 static int should_alloc_chunk(struct btrfs_root *root,
4346                               struct btrfs_space_info *sinfo, int force)
4347 {
4348         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4349         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
4350         u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
4351         u64 thresh;
4352
4353         if (force == CHUNK_ALLOC_FORCE)
4354                 return 1;
4355
4356         /*
4357          * We need to take into account the global rsv because for all intents
4358          * and purposes it's used space.  Don't worry about locking the
4359          * global_rsv, it doesn't change except when the transaction commits.
4360          */
4361         if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4362                 num_allocated += calc_global_rsv_need_space(global_rsv);
4363
4364         /*
4365          * in limited mode, we want to have some free space up to
4366          * about 1% of the FS size.
4367          */
4368         if (force == CHUNK_ALLOC_LIMITED) {
4369                 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
4370                 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4371
4372                 if (num_bytes - num_allocated < thresh)
4373                         return 1;
4374         }
4375
4376         if (num_allocated + SZ_2M < div_factor(num_bytes, 8))
4377                 return 0;
4378         return 1;
4379 }
4380
4381 static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
4382 {
4383         u64 num_dev;
4384
4385         if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4386                     BTRFS_BLOCK_GROUP_RAID0 |
4387                     BTRFS_BLOCK_GROUP_RAID5 |
4388                     BTRFS_BLOCK_GROUP_RAID6))
4389                 num_dev = root->fs_info->fs_devices->rw_devices;
4390         else if (type & BTRFS_BLOCK_GROUP_RAID1)
4391                 num_dev = 2;
4392         else
4393                 num_dev = 1;    /* DUP or single */
4394
4395         return num_dev;
4396 }
4397
4398 /*
4399  * If @is_allocation is true, reserve space in the system space info necessary
4400  * for allocating a chunk, otherwise if it's false, reserve space necessary for
4401  * removing a chunk.
4402  */
4403 void check_system_chunk(struct btrfs_trans_handle *trans,
4404                         struct btrfs_root *root,
4405                         u64 type)
4406 {
4407         struct btrfs_space_info *info;
4408         u64 left;
4409         u64 thresh;
4410         int ret = 0;
4411         u64 num_devs;
4412
4413         /*
4414          * Needed because we can end up allocating a system chunk and for an
4415          * atomic and race free space reservation in the chunk block reserve.
4416          */
4417         ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
4418
4419         info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4420         spin_lock(&info->lock);
4421         left = info->total_bytes - info->bytes_used - info->bytes_pinned -
4422                 info->bytes_reserved - info->bytes_readonly -
4423                 info->bytes_may_use;
4424         spin_unlock(&info->lock);
4425
4426         num_devs = get_profile_num_devs(root, type);
4427
4428         /* num_devs device items to update and 1 chunk item to add or remove */
4429         thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
4430                 btrfs_calc_trans_metadata_size(root, 1);
4431
4432         if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
4433                 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
4434                         left, thresh, type);
4435                 dump_space_info(info, 0, 0);
4436         }
4437
4438         if (left < thresh) {
4439                 u64 flags;
4440
4441                 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
4442                 /*
4443                  * Ignore failure to create system chunk. We might end up not
4444                  * needing it, as we might not need to COW all nodes/leafs from
4445                  * the paths we visit in the chunk tree (they were already COWed
4446                  * or created in the current transaction for example).
4447                  */
4448                 ret = btrfs_alloc_chunk(trans, root, flags);
4449         }
4450
4451         if (!ret) {
4452                 ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
4453                                           &root->fs_info->chunk_block_rsv,
4454                                           thresh, BTRFS_RESERVE_NO_FLUSH);
4455                 if (!ret)
4456                         trans->chunk_bytes_reserved += thresh;
4457         }
4458 }
4459
/*
 * Try to allocate a new chunk for the space_info matching @flags.
 *
 * @trans       - transaction handle the allocation is performed under
 * @extent_root - root used to reach fs_info and passed to the chunk helpers
 * @flags       - block group type/profile bits of the chunk to allocate
 * @force       - one of the CHUNK_ALLOC_* levels; raised to the space_info's
 *                own force_alloc level if that one is higher
 *
 * Returns 1 if a chunk was allocated, 0 if should_alloc_chunk() decided no
 * allocation is needed, -ENOSPC if this handle is already allocating a chunk,
 * if the space_info is marked full while an allocation is still wanted, or if
 * btrfs_alloc_chunk() reported -ENOSPC (in which case the space_info is marked
 * full).  Any other negative error from btrfs_alloc_chunk() is returned as is.
 */
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 flags, int force)
{
        struct btrfs_space_info *space_info;
        struct btrfs_fs_info *fs_info = extent_root->fs_info;
        int wait_for_alloc = 0;
        int ret = 0;

        /* Don't re-enter if we're already allocating a chunk */
        if (trans->allocating_chunk)
                return -ENOSPC;

        space_info = __find_space_info(extent_root->fs_info, flags);
        if (!space_info) {
                /* No space_info for these flags yet, create an empty one. */
                ret = update_space_info(extent_root->fs_info, flags,
                                        0, 0, 0, &space_info);
                BUG_ON(ret); /* -ENOMEM */
        }
        BUG_ON(!space_info); /* Logic error */

again:
        spin_lock(&space_info->lock);
        /* A pending forced allocation on the space_info overrides ours. */
        if (force < space_info->force_alloc)
                force = space_info->force_alloc;
        if (space_info->full) {
                /* Full: only an error if we would actually have allocated. */
                if (should_alloc_chunk(extent_root, space_info, force))
                        ret = -ENOSPC;
                else
                        ret = 0;
                spin_unlock(&space_info->lock);
                return ret;
        }

        if (!should_alloc_chunk(extent_root, space_info, force)) {
                spin_unlock(&space_info->lock);
                return 0;
        } else if (space_info->chunk_alloc) {
                /* Somebody else is allocating for this space_info. */
                wait_for_alloc = 1;
        } else {
                /* Claim the allocation; cleared again at the out label. */
                space_info->chunk_alloc = 1;
        }

        spin_unlock(&space_info->lock);

        mutex_lock(&fs_info->chunk_mutex);

        /*
         * The chunk_mutex is held throughout the entirety of a chunk
         * allocation, so once we've acquired the chunk_mutex we know that the
         * other guy is done and we need to recheck and see if we should
         * allocate.
         */
        if (wait_for_alloc) {
                mutex_unlock(&fs_info->chunk_mutex);
                wait_for_alloc = 0;
                goto again;
        }

        trans->allocating_chunk = true;

        /*
         * If we have mixed data/metadata chunks we want to make sure we keep
         * allocating mixed chunks instead of individual chunks.
         */
        if (btrfs_mixed_space_info(space_info))
                flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);

        /*
         * if we're doing a data chunk, go ahead and make sure that
         * we keep a reasonable number of metadata chunks allocated in the
         * FS as well.
         */
        if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
                fs_info->data_chunk_allocations++;
                if (!(fs_info->data_chunk_allocations %
                      fs_info->metadata_ratio))
                        force_metadata_allocation(fs_info);
        }

        /*
         * Check if we have enough space in SYSTEM chunk because we may need
         * to update devices.
         */
        check_system_chunk(trans, extent_root, flags);

        ret = btrfs_alloc_chunk(trans, extent_root, flags);
        trans->allocating_chunk = false;

        spin_lock(&space_info->lock);
        /* Errors other than -ENOSPC leave full/force_alloc untouched. */
        if (ret < 0 && ret != -ENOSPC)
                goto out;
        if (ret)
                space_info->full = 1;
        else
                ret = 1;

        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
        space_info->chunk_alloc = 0;
        spin_unlock(&space_info->lock);
        mutex_unlock(&fs_info->chunk_mutex);
        /*
         * When we allocate a new chunk we reserve space in the chunk block
         * reserve to make sure we can COW nodes/leafs in the chunk tree or
         * add new nodes/leafs to it if we end up needing to do it when
         * inserting the chunk item and updating device items as part of the
         * second phase of chunk allocation, performed by
         * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
         * large number of new block groups to create in our transaction
         * handle's new_bgs list to avoid exhausting the chunk block reserve
         * in extreme cases - like having a single transaction create many new
         * block groups when starting to write out the free space caches of all
         * the block groups that were made dirty during the lifetime of the
         * transaction.
         */
        if (trans->can_flush_pending_bgs &&
            trans->chunk_bytes_reserved >= (u64)SZ_2M) {
                btrfs_create_pending_block_groups(trans, extent_root);
                btrfs_trans_release_chunk_metadata(trans);
        }
        return ret;
}
4582
4583 static int can_overcommit(struct btrfs_root *root,
4584                           struct btrfs_space_info *space_info, u64 bytes,
4585                           enum btrfs_reserve_flush_enum flush)
4586 {
4587         struct btrfs_block_rsv *global_rsv;
4588         u64 profile;
4589         u64 space_size;
4590         u64 avail;
4591         u64 used;
4592
4593         /* Don't overcommit when in mixed mode. */
4594         if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4595                 return 0;
4596
4597         BUG_ON(root->fs_info == NULL);
4598         global_rsv = &root->fs_info->global_block_rsv;
4599         profile = btrfs_get_alloc_profile(root, 0);
4600         used = space_info->bytes_used + space_info->bytes_reserved +
4601                 space_info->bytes_pinned + space_info->bytes_readonly;
4602
4603         /*
4604          * We only want to allow over committing if we have lots of actual space
4605          * free, but if we don't have enough space to handle the global reserve
4606          * space then we could end up having a real enospc problem when trying
4607          * to allocate a chunk or some other such important allocation.
4608          */
4609         spin_lock(&global_rsv->lock);
4610         space_size = calc_global_rsv_need_space(global_rsv);
4611         spin_unlock(&global_rsv->lock);
4612         if (used + space_size >= space_info->total_bytes)
4613                 return 0;
4614
4615         used += space_info->bytes_may_use;
4616
4617         spin_lock(&root->fs_info->free_chunk_lock);
4618         avail = root->fs_info->free_chunk_space;
4619         spin_unlock(&root->fs_info->free_chunk_lock);
4620
4621         /*
4622          * If we have dup, raid1 or raid10 then only half of the free
4623          * space is actually useable.  For raid56, the space info used
4624          * doesn't include the parity drive, so we don't have to
4625          * change the math
4626          */
4627         if (profile & (BTRFS_BLOCK_GROUP_DUP |
4628                        BTRFS_BLOCK_GROUP_RAID1 |
4629                        BTRFS_BLOCK_GROUP_RAID10))
4630                 avail >>= 1;
4631
4632         /*
4633          * If we aren't flushing all things, let us overcommit up to
4634          * 1/2th of the space. If we can flush, don't let us overcommit
4635          * too much, let it overcommit up to 1/8 of the space.
4636          */
4637         if (flush == BTRFS_RESERVE_FLUSH_ALL)
4638                 avail >>= 3;
4639         else
4640                 avail >>= 1;
4641
4642         if (used + bytes < space_info->total_bytes + avail)
4643                 return 1;
4644         return 0;
4645 }
4646
4647 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
4648                                          unsigned long nr_pages, int nr_items)
4649 {
4650         struct super_block *sb = root->fs_info->sb;
4651
4652         if (down_read_trylock(&sb->s_umount)) {
4653                 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4654                 up_read(&sb->s_umount);
4655         } else {
4656                 /*
4657                  * We needn't worry the filesystem going from r/w to r/o though
4658                  * we don't acquire ->s_umount mutex, because the filesystem
4659                  * should guarantee the delalloc inodes list be empty after
4660                  * the filesystem is readonly(all dirty pages are written to
4661                  * the disk).
4662                  */
4663                 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
4664                 if (!current->journal_info)
4665                         btrfs_wait_ordered_roots(root->fs_info, nr_items,
4666                                                  0, (u64)-1);
4667         }
4668 }
4669
4670 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
4671 {
4672         u64 bytes;
4673         int nr;
4674
4675         bytes = btrfs_calc_trans_metadata_size(root, 1);
4676         nr = (int)div64_u64(to_reclaim, bytes);
4677         if (!nr)
4678                 nr = 1;
4679         return nr;
4680 }
4681
4682 #define EXTENT_SIZE_PER_ITEM    SZ_256K
4683
4684 /*
4685  * shrink metadata reservation for delalloc
4686  */
4687 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4688                             bool wait_ordered)
4689 {
4690         struct btrfs_block_rsv *block_rsv;
4691         struct btrfs_space_info *space_info;
4692         struct btrfs_trans_handle *trans;
4693         u64 delalloc_bytes;
4694         u64 max_reclaim;
4695         long time_left;
4696         unsigned long nr_pages;
4697         int loops;
4698         int items;
4699         enum btrfs_reserve_flush_enum flush;
4700
4701         /* Calc the number of the pages we need flush for space reservation */
4702         items = calc_reclaim_items_nr(root, to_reclaim);
4703         to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
4704
4705         trans = (struct btrfs_trans_handle *)current->journal_info;
4706         block_rsv = &root->fs_info->delalloc_block_rsv;
4707         space_info = block_rsv->space_info;
4708
4709         delalloc_bytes = percpu_counter_sum_positive(
4710                                                 &root->fs_info->delalloc_bytes);
4711         if (delalloc_bytes == 0) {
4712                 if (trans)
4713                         return;
4714                 if (wait_ordered)
4715                         btrfs_wait_ordered_roots(root->fs_info, items,
4716                                                  0, (u64)-1);
4717                 return;
4718         }
4719
4720         loops = 0;
4721         while (delalloc_bytes && loops < 3) {
4722                 max_reclaim = min(delalloc_bytes, to_reclaim);
4723                 nr_pages = max_reclaim >> PAGE_SHIFT;
4724                 btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4725                 /*
4726                  * We need to wait for the async pages to actually start before
4727                  * we do anything.
4728                  */
4729                 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
4730                 if (!max_reclaim)
4731                         goto skip_async;
4732
4733                 if (max_reclaim <= nr_pages)
4734                         max_reclaim = 0;
4735                 else
4736                         max_reclaim -= nr_pages;
4737
4738                 wait_event(root->fs_info->async_submit_wait,
4739                            atomic_read(&root->fs_info->async_delalloc_pages) <=
4740                            (int)max_reclaim);
4741 skip_async:
4742                 if (!trans)
4743                         flush = BTRFS_RESERVE_FLUSH_ALL;
4744                 else
4745                         flush = BTRFS_RESERVE_NO_FLUSH;
4746                 spin_lock(&space_info->lock);
4747                 if (can_overcommit(root, space_info, orig, flush)) {
4748                         spin_unlock(&space_info->lock);
4749                         break;
4750                 }
4751                 if (list_empty(&space_info->tickets) &&
4752                     list_empty(&space_info->priority_tickets)) {
4753                         spin_unlock(&space_info->lock);
4754                         break;
4755                 }
4756                 spin_unlock(&space_info->lock);
4757
4758                 loops++;
4759                 if (wait_ordered && !trans) {
4760                         btrfs_wait_ordered_roots(root->fs_info, items,
4761                                                  0, (u64)-1);
4762                 } else {
4763                         time_left = schedule_timeout_killable(1);
4764                         if (time_left)
4765                                 break;
4766                 }
4767                 delalloc_bytes = percpu_counter_sum_positive(
4768                                                 &root->fs_info->delalloc_bytes);
4769         }
4770 }
4771
4772 /**
4773  * maybe_commit_transaction - possibly commit the transaction if its ok to
4774  * @root - the root we're allocating for
4775  * @bytes - the number of bytes we want to reserve
4776  * @force - force the commit
4777  *
4778  * This will check to make sure that committing the transaction will actually
4779  * get us somewhere and then commit the transaction if it does.  Otherwise it
4780  * will return -ENOSPC.
4781  */
4782 static int may_commit_transaction(struct btrfs_root *root,
4783                                   struct btrfs_space_info *space_info,
4784                                   u64 bytes, int force)
4785 {
4786         struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4787         struct btrfs_trans_handle *trans;
4788
4789         trans = (struct btrfs_trans_handle *)current->journal_info;
4790         if (trans)
4791                 return -EAGAIN;
4792
4793         if (force)
4794                 goto commit;
4795
4796         /* See if there is enough pinned space to make this reservation */
4797         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4798                                    bytes) >= 0)
4799                 goto commit;
4800
4801         /*
4802          * See if there is some space in the delayed insertion reservation for
4803          * this reservation.
4804          */
4805         if (space_info != delayed_rsv->space_info)
4806                 return -ENOSPC;
4807
4808         spin_lock(&delayed_rsv->lock);
4809         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4810                                    bytes - delayed_rsv->size) >= 0) {
4811                 spin_unlock(&delayed_rsv->lock);
4812                 return -ENOSPC;
4813         }
4814         spin_unlock(&delayed_rsv->lock);
4815
4816 commit:
4817         trans = btrfs_join_transaction(root);
4818         if (IS_ERR(trans))
4819                 return -ENOSPC;
4820
4821         return btrfs_commit_transaction(trans, root);
4822 }
4823
/*
 * A pending metadata reservation that could not be satisfied immediately.
 * Tickets are set up on the stack of the reserving task and linked into
 * either space_info->tickets or space_info->priority_tickets.
 */
struct reserve_ticket {
        /*
         * Bytes still outstanding; starts at the requested size and the
         * waiters treat 0 as "fully satisfied".
         * NOTE(review): the code that decrements this is not in this part of
         * the file - confirm against the space release path.
         */
        u64 bytes;
        /* 0 while pending; set to -ENOSPC by wake_all_tickets() */
        int error;
        /* link in the space_info ticket list the ticket was queued on */
        struct list_head list;
        /* the reserving task sleeps here in wait_reserve_ticket() */
        wait_queue_head_t wait;
};
4830
4831 static int flush_space(struct btrfs_root *root,
4832                        struct btrfs_space_info *space_info, u64 num_bytes,
4833                        u64 orig_bytes, int state)
4834 {
4835         struct btrfs_trans_handle *trans;
4836         int nr;
4837         int ret = 0;
4838
4839         switch (state) {
4840         case FLUSH_DELAYED_ITEMS_NR:
4841         case FLUSH_DELAYED_ITEMS:
4842                 if (state == FLUSH_DELAYED_ITEMS_NR)
4843                         nr = calc_reclaim_items_nr(root, num_bytes) * 2;
4844                 else
4845                         nr = -1;
4846
4847                 trans = btrfs_join_transaction(root);
4848                 if (IS_ERR(trans)) {
4849                         ret = PTR_ERR(trans);
4850                         break;
4851                 }
4852                 ret = btrfs_run_delayed_items_nr(trans, root, nr);
4853                 btrfs_end_transaction(trans, root);
4854                 break;
4855         case FLUSH_DELALLOC:
4856         case FLUSH_DELALLOC_WAIT:
4857                 shrink_delalloc(root, num_bytes * 2, orig_bytes,
4858                                 state == FLUSH_DELALLOC_WAIT);
4859                 break;
4860         case ALLOC_CHUNK:
4861                 trans = btrfs_join_transaction(root);
4862                 if (IS_ERR(trans)) {
4863                         ret = PTR_ERR(trans);
4864                         break;
4865                 }
4866                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4867                                      btrfs_get_alloc_profile(root, 0),
4868                                      CHUNK_ALLOC_NO_FORCE);
4869                 btrfs_end_transaction(trans, root);
4870                 if (ret > 0 || ret == -ENOSPC)
4871                         ret = 0;
4872                 break;
4873         case COMMIT_TRANS:
4874                 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4875                 break;
4876         default:
4877                 ret = -ENOSPC;
4878                 break;
4879         }
4880
4881         trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes,
4882                                 orig_bytes, state, ret);
4883         return ret;
4884 }
4885
4886 static inline u64
4887 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4888                                  struct btrfs_space_info *space_info)
4889 {
4890         struct reserve_ticket *ticket;
4891         u64 used;
4892         u64 expected;
4893         u64 to_reclaim = 0;
4894
4895         to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4896         if (can_overcommit(root, space_info, to_reclaim,
4897                            BTRFS_RESERVE_FLUSH_ALL))
4898                 return 0;
4899
4900         list_for_each_entry(ticket, &space_info->tickets, list)
4901                 to_reclaim += ticket->bytes;
4902         list_for_each_entry(ticket, &space_info->priority_tickets, list)
4903                 to_reclaim += ticket->bytes;
4904         if (to_reclaim)
4905                 return to_reclaim;
4906
4907         used = space_info->bytes_used + space_info->bytes_reserved +
4908                space_info->bytes_pinned + space_info->bytes_readonly +
4909                space_info->bytes_may_use;
4910         if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
4911                 expected = div_factor_fine(space_info->total_bytes, 95);
4912         else
4913                 expected = div_factor_fine(space_info->total_bytes, 90);
4914
4915         if (used > expected)
4916                 to_reclaim = used - expected;
4917         else
4918                 to_reclaim = 0;
4919         to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4920                                      space_info->bytes_reserved);
4921         return to_reclaim;
4922 }
4923
4924 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4925                                         struct btrfs_root *root, u64 used)
4926 {
4927         u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4928
4929         /* If we're just plain full then async reclaim just slows us down. */
4930         if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4931                 return 0;
4932
4933         if (!btrfs_calc_reclaim_metadata_size(root, space_info))
4934                 return 0;
4935
4936         return (used >= thresh && !btrfs_fs_closing(root->fs_info) &&
4937                 !test_bit(BTRFS_FS_STATE_REMOUNTING,
4938                           &root->fs_info->fs_state));
4939 }
4940
4941 static void wake_all_tickets(struct list_head *head)
4942 {
4943         struct reserve_ticket *ticket;
4944
4945         while (!list_empty(head)) {
4946                 ticket = list_first_entry(head, struct reserve_ticket, list);
4947                 list_del_init(&ticket->list);
4948                 ticket->error = -ENOSPC;
4949                 wake_up(&ticket->wait);
4950         }
4951 }
4952
4953 /*
4954  * This is for normal flushers, we can wait all goddamned day if we want to.  We
4955  * will loop and continuously try to flush as long as we are making progress.
4956  * We count progress as clearing off tickets each time we have to loop.
4957  */
4958 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4959 {
4960         struct reserve_ticket *last_ticket = NULL;
4961         struct btrfs_fs_info *fs_info;
4962         struct btrfs_space_info *space_info;
4963         u64 to_reclaim;
4964         int flush_state;
4965         int commit_cycles = 0;
4966
4967         fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4968         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4969
4970         spin_lock(&space_info->lock);
4971         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
4972                                                       space_info);
4973         if (!to_reclaim) {
4974                 space_info->flush = 0;
4975                 spin_unlock(&space_info->lock);
4976                 return;
4977         }
4978         last_ticket = list_first_entry(&space_info->tickets,
4979                                        struct reserve_ticket, list);
4980         spin_unlock(&space_info->lock);
4981
4982         flush_state = FLUSH_DELAYED_ITEMS_NR;
4983         do {
4984                 struct reserve_ticket *ticket;
4985                 int ret;
4986
4987                 ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
4988                             to_reclaim, flush_state);
4989                 spin_lock(&space_info->lock);
4990                 if (list_empty(&space_info->tickets)) {
4991                         space_info->flush = 0;
4992                         spin_unlock(&space_info->lock);
4993                         return;
4994                 }
4995                 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
4996                                                               space_info);
4997                 ticket = list_first_entry(&space_info->tickets,
4998                                           struct reserve_ticket, list);
4999                 if (last_ticket == ticket) {
5000                         flush_state++;
5001                 } else {
5002                         last_ticket = ticket;
5003                         flush_state = FLUSH_DELAYED_ITEMS_NR;
5004                         if (commit_cycles)
5005                                 commit_cycles--;
5006                 }
5007
5008                 if (flush_state > COMMIT_TRANS) {
5009                         commit_cycles++;
5010                         if (commit_cycles > 2) {
5011                                 wake_all_tickets(&space_info->tickets);
5012                                 space_info->flush = 0;
5013                         } else {
5014                                 flush_state = FLUSH_DELAYED_ITEMS_NR;
5015                         }
5016                 }
5017                 spin_unlock(&space_info->lock);
5018         } while (flush_state <= COMMIT_TRANS);
5019 }
5020
/*
 * Initialize @work so that, once queued, it runs
 * btrfs_async_reclaim_metadata_space().
 */
void btrfs_init_async_reclaim_work(struct work_struct *work)
{
        INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}
5025
5026 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5027                                             struct btrfs_space_info *space_info,
5028                                             struct reserve_ticket *ticket)
5029 {
5030         u64 to_reclaim;
5031         int flush_state = FLUSH_DELAYED_ITEMS_NR;
5032
5033         spin_lock(&space_info->lock);
5034         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
5035                                                       space_info);
5036         if (!to_reclaim) {
5037                 spin_unlock(&space_info->lock);
5038                 return;
5039         }
5040         spin_unlock(&space_info->lock);
5041
5042         do {
5043                 flush_space(fs_info->fs_root, space_info, to_reclaim,
5044                             to_reclaim, flush_state);
5045                 flush_state++;
5046                 spin_lock(&space_info->lock);
5047                 if (ticket->bytes == 0) {
5048                         spin_unlock(&space_info->lock);
5049                         return;
5050                 }
5051                 spin_unlock(&space_info->lock);
5052
5053                 /*
5054                  * Priority flushers can't wait on delalloc without
5055                  * deadlocking.
5056                  */
5057                 if (flush_state == FLUSH_DELALLOC ||
5058                     flush_state == FLUSH_DELALLOC_WAIT)
5059                         flush_state = ALLOC_CHUNK;
5060         } while (flush_state < COMMIT_TRANS);
5061 }
5062
5063 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5064                                struct btrfs_space_info *space_info,
5065                                struct reserve_ticket *ticket, u64 orig_bytes)
5066
5067 {
5068         DEFINE_WAIT(wait);
5069         int ret = 0;
5070
5071         spin_lock(&space_info->lock);
5072         while (ticket->bytes > 0 && ticket->error == 0) {
5073                 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5074                 if (ret) {
5075                         ret = -EINTR;
5076                         break;
5077                 }
5078                 spin_unlock(&space_info->lock);
5079
5080                 schedule();
5081
5082                 finish_wait(&ticket->wait, &wait);
5083                 spin_lock(&space_info->lock);
5084         }
5085         if (!ret)
5086                 ret = ticket->error;
5087         if (!list_empty(&ticket->list))
5088                 list_del_init(&ticket->list);
5089         if (ticket->bytes && ticket->bytes < orig_bytes) {
5090                 u64 num_bytes = orig_bytes - ticket->bytes;
5091                 space_info->bytes_may_use -= num_bytes;
5092                 trace_btrfs_space_reservation(fs_info, "space_info",
5093                                               space_info->flags, num_bytes, 0);
5094         }
5095         spin_unlock(&space_info->lock);
5096
5097         return ret;
5098 }
5099
5100 /**
5101  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5102  * @root - the root we're allocating for
5103  * @space_info - the space info we want to allocate from
5104  * @orig_bytes - the number of bytes we want
5105  * @flush - whether or not we can flush to make our reservation
5106  *
5107  * This will reserve orig_bytes number of bytes from the space info associated
5108  * with the block_rsv.  If there is not enough space it will make an attempt to
5109  * flush out space to make room.  It will do this by flushing delalloc if
5110  * possible or committing the transaction.  If flush is 0 then no attempts to
5111  * regain reservations will be made and this will fail if there is not enough
5112  * space already.
5113  */
5114 static int __reserve_metadata_bytes(struct btrfs_root *root,
5115                                     struct btrfs_space_info *space_info,
5116                                     u64 orig_bytes,
5117                                     enum btrfs_reserve_flush_enum flush)
5118 {
5119         struct reserve_ticket ticket;
5120         u64 used;
5121         int ret = 0;
5122
5123         ASSERT(orig_bytes);
5124         ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5125
5126         spin_lock(&space_info->lock);
5127         ret = -ENOSPC;
5128         used = space_info->bytes_used + space_info->bytes_reserved +
5129                 space_info->bytes_pinned + space_info->bytes_readonly +
5130                 space_info->bytes_may_use;
5131
5132         /*
5133          * If we have enough space then hooray, make our reservation and carry
5134          * on.  If not see if we can overcommit, and if we can, hooray carry on.
5135          * If not things get more complicated.
5136          */
5137         if (used + orig_bytes <= space_info->total_bytes) {
5138                 space_info->bytes_may_use += orig_bytes;
5139                 trace_btrfs_space_reservation(root->fs_info, "space_info",
5140                                               space_info->flags, orig_bytes,
5141                                               1);
5142                 ret = 0;
5143         } else if (can_overcommit(root, space_info, orig_bytes, flush)) {
5144                 space_info->bytes_may_use += orig_bytes;
5145                 trace_btrfs_space_reservation(root->fs_info, "space_info",
5146                                               space_info->flags, orig_bytes,
5147                                               1);
5148                 ret = 0;
5149         }
5150
5151         /*
5152          * If we couldn't make a reservation then setup our reservation ticket
5153          * and kick the async worker if it's not already running.
5154          *
5155          * If we are a priority flusher then we just need to add our ticket to
5156          * the list and we will do our own flushing further down.
5157          */
5158         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5159                 ticket.bytes = orig_bytes;
5160                 ticket.error = 0;
5161                 init_waitqueue_head(&ticket.wait);
5162                 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5163                         list_add_tail(&ticket.list, &space_info->tickets);
5164                         if (!space_info->flush) {
5165                                 space_info->flush = 1;
5166                                 trace_btrfs_trigger_flush(root->fs_info,
5167                                                           space_info->flags,
5168                                                           orig_bytes, flush,
5169                                                           "enospc");
5170                                 queue_work(system_unbound_wq,
5171                                            &root->fs_info->async_reclaim_work);
5172                         }
5173                 } else {
5174                         list_add_tail(&ticket.list,
5175                                       &space_info->priority_tickets);
5176                 }
5177         } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5178                 used += orig_bytes;
5179                 /*
5180                  * We will do the space reservation dance during log replay,
5181                  * which means we won't have fs_info->fs_root set, so don't do
5182                  * the async reclaim as we will panic.
5183                  */
5184                 if (!root->fs_info->log_root_recovering &&
5185                     need_do_async_reclaim(space_info, root, used) &&
5186                     !work_busy(&root->fs_info->async_reclaim_work)) {
5187                         trace_btrfs_trigger_flush(root->fs_info,
5188                                                   space_info->flags,
5189                                                   orig_bytes, flush,
5190                                                   "preempt");
5191                         queue_work(system_unbound_wq,
5192                                    &root->fs_info->async_reclaim_work);
5193                 }
5194         }
5195         spin_unlock(&space_info->lock);
5196         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5197                 return ret;
5198
5199         if (flush == BTRFS_RESERVE_FLUSH_ALL)
5200                 return wait_reserve_ticket(root->fs_info, space_info, &ticket,
5201                                            orig_bytes);
5202
5203         ret = 0;
5204         priority_reclaim_metadata_space(root->fs_info, space_info, &ticket);
5205         spin_lock(&space_info->lock);
5206         if (ticket.bytes) {
5207                 if (ticket.bytes < orig_bytes) {
5208                         u64 num_bytes = orig_bytes - ticket.bytes;
5209                         space_info->bytes_may_use -= num_bytes;
5210                         trace_btrfs_space_reservation(root->fs_info,
5211                                         "space_info", space_info->flags,
5212                                         num_bytes, 0);
5213
5214                 }
5215                 list_del_init(&ticket.list);
5216                 ret = -ENOSPC;
5217         }
5218         spin_unlock(&space_info->lock);
5219         ASSERT(list_empty(&ticket.list));
5220         return ret;
5221 }
5222
5223 /**
5224  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5225  * @root - the root we're allocating for
5226  * @block_rsv - the block_rsv we're allocating for
5227  * @orig_bytes - the number of bytes we want
5228  * @flush - whether or not we can flush to make our reservation
5229  *
5230  * This will reserve orgi_bytes number of bytes from the space info associated
5231  * with the block_rsv.  If there is not enough space it will make an attempt to
5232  * flush out space to make room.  It will do this by flushing delalloc if
5233  * possible or committing the transaction.  If flush is 0 then no attempts to
5234  * regain reservations will be made and this will fail if there is not enough
5235  * space already.
5236  */
5237 static int reserve_metadata_bytes(struct btrfs_root *root,
5238                                   struct btrfs_block_rsv *block_rsv,
5239                                   u64 orig_bytes,
5240                                   enum btrfs_reserve_flush_enum flush)
5241 {
5242         int ret;
5243
5244         ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
5245                                        flush);
5246         if (ret == -ENOSPC &&
5247             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5248                 struct btrfs_block_rsv *global_rsv =
5249                         &root->fs_info->global_block_rsv;
5250
5251                 if (block_rsv != global_rsv &&
5252                     !block_rsv_use_bytes(global_rsv, orig_bytes))
5253                         ret = 0;
5254         }
5255         if (ret == -ENOSPC)
5256                 trace_btrfs_space_reservation(root->fs_info,
5257                                               "space_info:enospc",
5258                                               block_rsv->space_info->flags,
5259                                               orig_bytes, 1);
5260         return ret;
5261 }
5262
5263 static struct btrfs_block_rsv *get_block_rsv(
5264                                         const struct btrfs_trans_handle *trans,
5265                                         const struct btrfs_root *root)
5266 {
5267         struct btrfs_block_rsv *block_rsv = NULL;
5268
5269         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5270             (root == root->fs_info->csum_root && trans->adding_csums) ||
5271              (root == root->fs_info->uuid_root))
5272                 block_rsv = trans->block_rsv;
5273
5274         if (!block_rsv)
5275                 block_rsv = root->block_rsv;
5276
5277         if (!block_rsv)
5278                 block_rsv = &root->fs_info->empty_block_rsv;
5279
5280         return block_rsv;
5281 }
5282
5283 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5284                                u64 num_bytes)
5285 {
5286         int ret = -ENOSPC;
5287         spin_lock(&block_rsv->lock);
5288         if (block_rsv->reserved >= num_bytes) {
5289                 block_rsv->reserved -= num_bytes;
5290                 if (block_rsv->reserved < block_rsv->size)
5291                         block_rsv->full = 0;
5292                 ret = 0;
5293         }
5294         spin_unlock(&block_rsv->lock);
5295         return ret;
5296 }
5297
5298 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5299                                 u64 num_bytes, int update_size)
5300 {
5301         spin_lock(&block_rsv->lock);
5302         block_rsv->reserved += num_bytes;
5303         if (update_size)
5304                 block_rsv->size += num_bytes;
5305         else if (block_rsv->reserved >= block_rsv->size)
5306                 block_rsv->full = 1;
5307         spin_unlock(&block_rsv->lock);
5308 }
5309
5310 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5311                              struct btrfs_block_rsv *dest, u64 num_bytes,
5312                              int min_factor)
5313 {
5314         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5315         u64 min_bytes;
5316
5317         if (global_rsv->space_info != dest->space_info)
5318                 return -ENOSPC;
5319
5320         spin_lock(&global_rsv->lock);
5321         min_bytes = div_factor(global_rsv->size, min_factor);
5322         if (global_rsv->reserved < min_bytes + num_bytes) {
5323                 spin_unlock(&global_rsv->lock);
5324                 return -ENOSPC;
5325         }
5326         global_rsv->reserved -= num_bytes;
5327         if (global_rsv->reserved < global_rsv->size)
5328                 global_rsv->full = 0;
5329         spin_unlock(&global_rsv->lock);
5330
5331         block_rsv_add_bytes(dest, num_bytes, 1);
5332         return 0;
5333 }
5334
5335 /*
5336  * This is for space we already have accounted in space_info->bytes_may_use, so
5337  * basically when we're returning space from block_rsv's.
5338  */
5339 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5340                                      struct btrfs_space_info *space_info,
5341                                      u64 num_bytes)
5342 {
5343         struct reserve_ticket *ticket;
5344         struct list_head *head;
5345         u64 used;
5346         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5347         bool check_overcommit = false;
5348
5349         spin_lock(&space_info->lock);
5350         head = &space_info->priority_tickets;
5351
5352         /*
5353          * If we are over our limit then we need to check and see if we can
5354          * overcommit, and if we can't then we just need to free up our space
5355          * and not satisfy any requests.
5356          */
5357         used = space_info->bytes_used + space_info->bytes_reserved +
5358                 space_info->bytes_pinned + space_info->bytes_readonly +
5359                 space_info->bytes_may_use;
5360         if (used - num_bytes >= space_info->total_bytes)
5361                 check_overcommit = true;
5362 again:
5363         while (!list_empty(head) && num_bytes) {
5364                 ticket = list_first_entry(head, struct reserve_ticket,
5365                                           list);
5366                 /*
5367                  * We use 0 bytes because this space is already reserved, so
5368                  * adding the ticket space would be a double count.
5369                  */
5370                 if (check_overcommit &&
5371                     !can_overcommit(fs_info->extent_root, space_info, 0,
5372                                     flush))
5373                         break;
5374                 if (num_bytes >= ticket->bytes) {
5375                         list_del_init(&ticket->list);
5376                         num_bytes -= ticket->bytes;
5377                         ticket->bytes = 0;
5378                         wake_up(&ticket->wait);
5379                 } else {
5380                         ticket->bytes -= num_bytes;
5381                         num_bytes = 0;
5382                 }
5383         }
5384
5385         if (num_bytes && head == &space_info->priority_tickets) {
5386                 head = &space_info->tickets;
5387                 flush = BTRFS_RESERVE_FLUSH_ALL;
5388                 goto again;
5389         }
5390         space_info->bytes_may_use -= num_bytes;
5391         trace_btrfs_space_reservation(fs_info, "space_info",
5392                                       space_info->flags, num_bytes, 0);
5393         spin_unlock(&space_info->lock);
5394 }
5395
5396 /*
5397  * This is for newly allocated space that isn't accounted in
5398  * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
5399  * we use this helper.
5400  */
5401 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5402                                      struct btrfs_space_info *space_info,
5403                                      u64 num_bytes)
5404 {
5405         struct reserve_ticket *ticket;
5406         struct list_head *head = &space_info->priority_tickets;
5407
5408 again:
5409         while (!list_empty(head) && num_bytes) {
5410                 ticket = list_first_entry(head, struct reserve_ticket,
5411                                           list);
5412                 if (num_bytes >= ticket->bytes) {
5413                         trace_btrfs_space_reservation(fs_info, "space_info",
5414                                                       space_info->flags,
5415                                                       ticket->bytes, 1);
5416                         list_del_init(&ticket->list);
5417                         num_bytes -= ticket->bytes;
5418                         space_info->bytes_may_use += ticket->bytes;
5419                         ticket->bytes = 0;
5420                         wake_up(&ticket->wait);
5421                 } else {
5422                         trace_btrfs_space_reservation(fs_info, "space_info",
5423                                                       space_info->flags,
5424                                                       num_bytes, 1);
5425                         space_info->bytes_may_use += num_bytes;
5426                         ticket->bytes -= num_bytes;
5427                         num_bytes = 0;
5428                 }
5429         }
5430
5431         if (num_bytes && head == &space_info->priority_tickets) {
5432                 head = &space_info->tickets;
5433                 goto again;
5434         }
5435 }
5436
5437 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5438                                     struct btrfs_block_rsv *block_rsv,
5439                                     struct btrfs_block_rsv *dest, u64 num_bytes)
5440 {
5441         struct btrfs_space_info *space_info = block_rsv->space_info;
5442
5443         spin_lock(&block_rsv->lock);
5444         if (num_bytes == (u64)-1)
5445                 num_bytes = block_rsv->size;
5446         block_rsv->size -= num_bytes;
5447         if (block_rsv->reserved >= block_rsv->size) {
5448                 num_bytes = block_rsv->reserved - block_rsv->size;
5449                 block_rsv->reserved = block_rsv->size;
5450                 block_rsv->full = 1;
5451         } else {
5452                 num_bytes = 0;
5453         }
5454         spin_unlock(&block_rsv->lock);
5455
5456         if (num_bytes > 0) {
5457                 if (dest) {
5458                         spin_lock(&dest->lock);
5459                         if (!dest->full) {
5460                                 u64 bytes_to_add;
5461
5462                                 bytes_to_add = dest->size - dest->reserved;
5463                                 bytes_to_add = min(num_bytes, bytes_to_add);
5464                                 dest->reserved += bytes_to_add;
5465                                 if (dest->reserved >= dest->size)
5466                                         dest->full = 1;
5467                                 num_bytes -= bytes_to_add;
5468                         }
5469                         spin_unlock(&dest->lock);
5470                 }
5471                 if (num_bytes)
5472                         space_info_add_old_bytes(fs_info, space_info,
5473                                                  num_bytes);
5474         }
5475 }
5476
5477 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5478                             struct btrfs_block_rsv *dst, u64 num_bytes,
5479                             int update_size)
5480 {
5481         int ret;
5482
5483         ret = block_rsv_use_bytes(src, num_bytes);
5484         if (ret)
5485                 return ret;
5486
5487         block_rsv_add_bytes(dst, num_bytes, update_size);
5488         return 0;
5489 }
5490
5491 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5492 {
5493         memset(rsv, 0, sizeof(*rsv));
5494         spin_lock_init(&rsv->lock);
5495         rsv->type = type;
5496 }
5497
5498 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
5499                                               unsigned short type)
5500 {
5501         struct btrfs_block_rsv *block_rsv;
5502         struct btrfs_fs_info *fs_info = root->fs_info;
5503
5504         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5505         if (!block_rsv)
5506                 return NULL;
5507
5508         btrfs_init_block_rsv(block_rsv, type);
5509         block_rsv->space_info = __find_space_info(fs_info,
5510                                                   BTRFS_BLOCK_GROUP_METADATA);
5511         return block_rsv;
5512 }
5513
5514 void btrfs_free_block_rsv(struct btrfs_root *root,
5515                           struct btrfs_block_rsv *rsv)
5516 {
5517         if (!rsv)
5518                 return;
5519         btrfs_block_rsv_release(root, rsv, (u64)-1);
5520         kfree(rsv);
5521 }
5522
/*
 * Free the memory of @rsv only; unlike btrfs_free_block_rsv() nothing is
 * released back from the rsv first.
 */
void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
{
        kfree(rsv);
}
5527
5528 int btrfs_block_rsv_add(struct btrfs_root *root,
5529                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5530                         enum btrfs_reserve_flush_enum flush)
5531 {
5532         int ret;
5533
5534         if (num_bytes == 0)
5535                 return 0;
5536
5537         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5538         if (!ret) {
5539                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
5540                 return 0;
5541         }
5542
5543         return ret;
5544 }
5545
5546 int btrfs_block_rsv_check(struct btrfs_root *root,
5547                           struct btrfs_block_rsv *block_rsv, int min_factor)
5548 {
5549         u64 num_bytes = 0;
5550         int ret = -ENOSPC;
5551
5552         if (!block_rsv)
5553                 return 0;
5554
5555         spin_lock(&block_rsv->lock);
5556         num_bytes = div_factor(block_rsv->size, min_factor);
5557         if (block_rsv->reserved >= num_bytes)
5558                 ret = 0;
5559         spin_unlock(&block_rsv->lock);
5560
5561         return ret;
5562 }
5563
5564 int btrfs_block_rsv_refill(struct btrfs_root *root,
5565                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5566                            enum btrfs_reserve_flush_enum flush)
5567 {
5568         u64 num_bytes = 0;
5569         int ret = -ENOSPC;
5570
5571         if (!block_rsv)
5572                 return 0;
5573
5574         spin_lock(&block_rsv->lock);
5575         num_bytes = min_reserved;
5576         if (block_rsv->reserved >= num_bytes)
5577                 ret = 0;
5578         else
5579                 num_bytes -= block_rsv->reserved;
5580         spin_unlock(&block_rsv->lock);
5581
5582         if (!ret)
5583                 return 0;
5584
5585         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5586         if (!ret) {
5587                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5588                 return 0;
5589         }
5590
5591         return ret;
5592 }
5593
5594 void btrfs_block_rsv_release(struct btrfs_root *root,
5595                              struct btrfs_block_rsv *block_rsv,
5596                              u64 num_bytes)
5597 {
5598         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5599         if (global_rsv == block_rsv ||
5600             block_rsv->space_info != global_rsv->space_info)
5601                 global_rsv = NULL;
5602         block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
5603                                 num_bytes);
5604 }
5605
5606 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5607 {
5608         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5609         struct btrfs_space_info *sinfo = block_rsv->space_info;
5610         u64 num_bytes;
5611
5612         /*
5613          * The global block rsv is based on the size of the extent tree, the
5614          * checksum tree and the root tree.  If the fs is empty we want to set
5615          * it to a minimal amount for safety.
5616          */
5617         num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5618                 btrfs_root_used(&fs_info->csum_root->root_item) +
5619                 btrfs_root_used(&fs_info->tree_root->root_item);
5620         num_bytes = max_t(u64, num_bytes, SZ_16M);
5621
5622         spin_lock(&sinfo->lock);
5623         spin_lock(&block_rsv->lock);
5624
5625         block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5626
5627         if (block_rsv->reserved < block_rsv->size) {
5628                 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
5629                         sinfo->bytes_reserved + sinfo->bytes_readonly +
5630                         sinfo->bytes_may_use;
5631                 if (sinfo->total_bytes > num_bytes) {
5632                         num_bytes = sinfo->total_bytes - num_bytes;
5633                         num_bytes = min(num_bytes,
5634                                         block_rsv->size - block_rsv->reserved);
5635                         block_rsv->reserved += num_bytes;
5636                         sinfo->bytes_may_use += num_bytes;
5637                         trace_btrfs_space_reservation(fs_info, "space_info",
5638                                                       sinfo->flags, num_bytes,
5639                                                       1);
5640                 }
5641         } else if (block_rsv->reserved > block_rsv->size) {
5642                 num_bytes = block_rsv->reserved - block_rsv->size;
5643                 sinfo->bytes_may_use -= num_bytes;
5644                 trace_btrfs_space_reservation(fs_info, "space_info",
5645                                       sinfo->flags, num_bytes, 0);
5646                 block_rsv->reserved = block_rsv->size;
5647         }
5648
5649         if (block_rsv->reserved == block_rsv->size)
5650                 block_rsv->full = 1;
5651         else
5652                 block_rsv->full = 0;
5653
5654         spin_unlock(&block_rsv->lock);
5655         spin_unlock(&sinfo->lock);
5656 }
5657
5658 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5659 {
5660         struct btrfs_space_info *space_info;
5661
5662         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5663         fs_info->chunk_block_rsv.space_info = space_info;
5664
5665         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5666         fs_info->global_block_rsv.space_info = space_info;
5667         fs_info->delalloc_block_rsv.space_info = space_info;
5668         fs_info->trans_block_rsv.space_info = space_info;
5669         fs_info->empty_block_rsv.space_info = space_info;
5670         fs_info->delayed_block_rsv.space_info = space_info;
5671
5672         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5673         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5674         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5675         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5676         if (fs_info->quota_root)
5677                 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5678         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5679
5680         update_global_block_rsv(fs_info);
5681 }
5682
5683 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5684 {
5685         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5686                                 (u64)-1);
5687         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
5688         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
5689         WARN_ON(fs_info->trans_block_rsv.size > 0);
5690         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5691         WARN_ON(fs_info->chunk_block_rsv.size > 0);
5692         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5693         WARN_ON(fs_info->delayed_block_rsv.size > 0);
5694         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5695 }
5696
5697 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
5698                                   struct btrfs_root *root)
5699 {
5700         if (!trans->block_rsv)
5701                 return;
5702
5703         if (!trans->bytes_reserved)
5704                 return;
5705
5706         trace_btrfs_space_reservation(root->fs_info, "transaction",
5707                                       trans->transid, trans->bytes_reserved, 0);
5708         btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
5709         trans->bytes_reserved = 0;
5710 }
5711
5712 /*
5713  * To be called after all the new block groups attached to the transaction
5714  * handle have been created (btrfs_create_pending_block_groups()).
5715  */
5716 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5717 {
5718         struct btrfs_fs_info *fs_info = trans->fs_info;
5719
5720         if (!trans->chunk_bytes_reserved)
5721                 return;
5722
5723         WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5724
5725         block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5726                                 trans->chunk_bytes_reserved);
5727         trans->chunk_bytes_reserved = 0;
5728 }
5729
5730 /* Can only return 0 or -ENOSPC */
5731 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
5732                                   struct inode *inode)
5733 {
5734         struct btrfs_root *root = BTRFS_I(inode)->root;
5735         /*
5736          * We always use trans->block_rsv here as we will have reserved space
5737          * for our orphan when starting the transaction, using get_block_rsv()
5738          * here will sometimes make us choose the wrong block rsv as we could be
5739          * doing a reloc inode for a non refcounted root.
5740          */
5741         struct btrfs_block_rsv *src_rsv = trans->block_rsv;
5742         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
5743
5744         /*
5745          * We need to hold space in order to delete our orphan item once we've
5746          * added it, so this takes the reservation so we can release it later
5747          * when we are truly done with the orphan item.
5748          */
5749         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5750         trace_btrfs_space_reservation(root->fs_info, "orphan",
5751                                       btrfs_ino(inode), num_bytes, 1);
5752         return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
5753 }
5754
5755 void btrfs_orphan_release_metadata(struct inode *inode)
5756 {
5757         struct btrfs_root *root = BTRFS_I(inode)->root;
5758         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5759         trace_btrfs_space_reservation(root->fs_info, "orphan",
5760                                       btrfs_ino(inode), num_bytes, 0);
5761         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
5762 }
5763
5764 /*
5765  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5766  * root: the root of the parent directory
5767  * rsv: block reservation
5768  * items: the number of items that we need do reservation
5769  * qgroup_reserved: used to return the reserved size in qgroup
5770  *
5771  * This function is used to reserve the space for snapshot/subvolume
5772  * creation and deletion. Those operations are different with the
5773  * common file/directory operations, they change two fs/file trees
5774  * and root tree, the number of items that the qgroup reserves is
5775  * different with the free space reservation. So we can not use
5776  * the space reservation mechanism in start_transaction().
5777  */
5778 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5779                                      struct btrfs_block_rsv *rsv,
5780                                      int items,
5781                                      u64 *qgroup_reserved,
5782                                      bool use_global_rsv)
5783 {
5784         u64 num_bytes;
5785         int ret;
5786         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5787
5788         if (root->fs_info->quota_enabled) {
5789                 /* One for parent inode, two for dir entries */
5790                 num_bytes = 3 * root->nodesize;
5791                 ret = btrfs_qgroup_reserve_meta(root, num_bytes);
5792                 if (ret)
5793                         return ret;
5794         } else {
5795                 num_bytes = 0;
5796         }
5797
5798         *qgroup_reserved = num_bytes;
5799
5800         num_bytes = btrfs_calc_trans_metadata_size(root, items);
5801         rsv->space_info = __find_space_info(root->fs_info,
5802                                             BTRFS_BLOCK_GROUP_METADATA);
5803         ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5804                                   BTRFS_RESERVE_FLUSH_ALL);
5805
5806         if (ret == -ENOSPC && use_global_rsv)
5807                 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
5808
5809         if (ret && *qgroup_reserved)
5810                 btrfs_qgroup_free_meta(root, *qgroup_reserved);
5811
5812         return ret;
5813 }
5814
5815 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
5816                                       struct btrfs_block_rsv *rsv,
5817                                       u64 qgroup_reserved)
5818 {
5819         btrfs_block_rsv_release(root, rsv, (u64)-1);
5820 }
5821
5822 /**
5823  * drop_outstanding_extent - drop an outstanding extent
5824  * @inode: the inode we're dropping the extent for
5825  * @num_bytes: the number of bytes we're releasing.
5826  *
5827  * This is called when we are freeing up an outstanding extent, either called
5828  * after an error or after an extent is written.  This will return the number of
5829  * reserved extents that need to be freed.  This must be called with
5830  * BTRFS_I(inode)->lock held.
5831  */
5832 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
5833 {
5834         unsigned drop_inode_space = 0;
5835         unsigned dropped_extents = 0;
5836         unsigned num_extents = 0;
5837
5838         num_extents = (unsigned)div64_u64(num_bytes +
5839                                           BTRFS_MAX_EXTENT_SIZE - 1,
5840                                           BTRFS_MAX_EXTENT_SIZE);
5841         ASSERT(num_extents);
5842         ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
5843         BTRFS_I(inode)->outstanding_extents -= num_extents;
5844
5845         if (BTRFS_I(inode)->outstanding_extents == 0 &&
5846             test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5847                                &BTRFS_I(inode)->runtime_flags))
5848                 drop_inode_space = 1;
5849
5850         /*
5851          * If we have more or the same amount of outstanding extents than we have
5852          * reserved then we need to leave the reserved extents count alone.
5853          */
5854         if (BTRFS_I(inode)->outstanding_extents >=
5855             BTRFS_I(inode)->reserved_extents)
5856                 return drop_inode_space;
5857
5858         dropped_extents = BTRFS_I(inode)->reserved_extents -
5859                 BTRFS_I(inode)->outstanding_extents;
5860         BTRFS_I(inode)->reserved_extents -= dropped_extents;
5861         return dropped_extents + drop_inode_space;
5862 }
5863
5864 /**
5865  * calc_csum_metadata_size - return the amount of metadata space that must be
5866  *      reserved/freed for the given bytes.
5867  * @inode: the inode we're manipulating
5868  * @num_bytes: the number of bytes in question
5869  * @reserve: 1 if we are reserving space, 0 if we are freeing space
5870  *
5871  * This adjusts the number of csum_bytes in the inode and then returns the
5872  * correct amount of metadata that must either be reserved or freed.  We
5873  * calculate how many checksums we can fit into one leaf and then divide the
5874  * number of bytes that will need to be checksumed by this value to figure out
5875  * how many checksums will be required.  If we are adding bytes then the number
5876  * may go up and we will return the number of additional bytes that must be
5877  * reserved.  If it is going down we will return the number of bytes that must
5878  * be freed.
5879  *
5880  * This must be called with BTRFS_I(inode)->lock held.
5881  */
5882 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
5883                                    int reserve)
5884 {
5885         struct btrfs_root *root = BTRFS_I(inode)->root;
5886         u64 old_csums, num_csums;
5887
5888         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
5889             BTRFS_I(inode)->csum_bytes == 0)
5890                 return 0;
5891
5892         old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5893         if (reserve)
5894                 BTRFS_I(inode)->csum_bytes += num_bytes;
5895         else
5896                 BTRFS_I(inode)->csum_bytes -= num_bytes;
5897         num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5898
5899         /* No change, no need to reserve more */
5900         if (old_csums == num_csums)
5901                 return 0;
5902
5903         if (reserve)
5904                 return btrfs_calc_trans_metadata_size(root,
5905                                                       num_csums - old_csums);
5906
5907         return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
5908 }
5909
5910 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5911 {
5912         struct btrfs_root *root = BTRFS_I(inode)->root;
5913         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
5914         u64 to_reserve = 0;
5915         u64 csum_bytes;
5916         unsigned nr_extents = 0;
5917         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5918         int ret = 0;
5919         bool delalloc_lock = true;
5920         u64 to_free = 0;
5921         unsigned dropped;
5922         bool release_extra = false;
5923
5924         /* If we are a free space inode we need to not flush since we will be in
5925          * the middle of a transaction commit.  We also don't need the delalloc
5926          * mutex since we won't race with anybody.  We need this mostly to make
5927          * lockdep shut its filthy mouth.
5928          *
5929          * If we have a transaction open (can happen if we call truncate_block
5930          * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
5931          */
5932         if (btrfs_is_free_space_inode(inode)) {
5933                 flush = BTRFS_RESERVE_NO_FLUSH;
5934                 delalloc_lock = false;
5935         } else if (current->journal_info) {
5936                 flush = BTRFS_RESERVE_FLUSH_LIMIT;
5937         }
5938
5939         if (flush != BTRFS_RESERVE_NO_FLUSH &&
5940             btrfs_transaction_in_commit(root->fs_info))
5941                 schedule_timeout(1);
5942
5943         if (delalloc_lock)
5944                 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
5945
5946         num_bytes = ALIGN(num_bytes, root->sectorsize);
5947
5948         spin_lock(&BTRFS_I(inode)->lock);
5949         nr_extents = (unsigned)div64_u64(num_bytes +
5950                                          BTRFS_MAX_EXTENT_SIZE - 1,
5951                                          BTRFS_MAX_EXTENT_SIZE);
5952         BTRFS_I(inode)->outstanding_extents += nr_extents;
5953
5954         nr_extents = 0;
5955         if (BTRFS_I(inode)->outstanding_extents >
5956             BTRFS_I(inode)->reserved_extents)
5957                 nr_extents += BTRFS_I(inode)->outstanding_extents -
5958                         BTRFS_I(inode)->reserved_extents;
5959
5960         /* We always want to reserve a slot for updating the inode. */
5961         to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1);
5962         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
5963         csum_bytes = BTRFS_I(inode)->csum_bytes;
5964         spin_unlock(&BTRFS_I(inode)->lock);
5965
5966         if (root->fs_info->quota_enabled) {
5967                 ret = btrfs_qgroup_reserve_meta(root,
5968                                 nr_extents * root->nodesize);
5969                 if (ret)
5970                         goto out_fail;
5971         }
5972
5973         ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
5974         if (unlikely(ret)) {
5975                 btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
5976                 goto out_fail;
5977         }
5978
5979         spin_lock(&BTRFS_I(inode)->lock);
5980         if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5981                              &BTRFS_I(inode)->runtime_flags)) {
5982                 to_reserve -= btrfs_calc_trans_metadata_size(root, 1);
5983                 release_extra = true;
5984         }
5985         BTRFS_I(inode)->reserved_extents += nr_extents;
5986         spin_unlock(&BTRFS_I(inode)->lock);
5987
5988         if (delalloc_lock)
5989                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5990
5991         if (to_reserve)
5992                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5993                                               btrfs_ino(inode), to_reserve, 1);
5994         if (release_extra)
5995                 btrfs_block_rsv_release(root, block_rsv,
5996                                         btrfs_calc_trans_metadata_size(root,
5997                                                                        1));
5998         return 0;
5999
6000 out_fail:
6001         spin_lock(&BTRFS_I(inode)->lock);
6002         dropped = drop_outstanding_extent(inode, num_bytes);
6003         /*
6004          * If the inodes csum_bytes is the same as the original
6005          * csum_bytes then we know we haven't raced with any free()ers
6006          * so we can just reduce our inodes csum bytes and carry on.
6007          */
6008         if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
6009                 calc_csum_metadata_size(inode, num_bytes, 0);
6010         } else {
6011                 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
6012                 u64 bytes;
6013
6014                 /*
6015                  * This is tricky, but first we need to figure out how much we
6016                  * freed from any free-ers that occurred during this
6017                  * reservation, so we reset ->csum_bytes to the csum_bytes
6018                  * before we dropped our lock, and then call the free for the
6019                  * number of bytes that were freed while we were trying our
6020                  * reservation.
6021                  */
6022                 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
6023                 BTRFS_I(inode)->csum_bytes = csum_bytes;
6024                 to_free = calc_csum_metadata_size(inode, bytes, 0);
6025
6026
6027                 /*
6028                  * Now we need to see how much we would have freed had we not
6029                  * been making this reservation and our ->csum_bytes were not
6030                  * artificially inflated.
6031                  */
6032                 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
6033                 bytes = csum_bytes - orig_csum_bytes;
6034                 bytes = calc_csum_metadata_size(inode, bytes, 0);
6035
6036                 /*
6037                  * Now reset ->csum_bytes to what it should be.  If bytes is
6038                  * more than to_free then we would have freed more space had we
6039                  * not had an artificially high ->csum_bytes, so we need to free
6040                  * the remainder.  If bytes is the same or less then we don't
6041                  * need to do anything, the other free-ers did the correct
6042                  * thing.
6043                  */
6044                 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
6045                 if (bytes > to_free)
6046                         to_free = bytes - to_free;
6047                 else
6048                         to_free = 0;
6049         }
6050         spin_unlock(&BTRFS_I(inode)->lock);
6051         if (dropped)
6052                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
6053
6054         if (to_free) {
6055                 btrfs_block_rsv_release(root, block_rsv, to_free);
6056                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
6057                                               btrfs_ino(inode), to_free, 0);
6058         }
6059         if (delalloc_lock)
6060                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
6061         return ret;
6062 }
6063
6064 /**
6065  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6066  * @inode: the inode to release the reservation for
6067  * @num_bytes: the number of bytes we're releasing
6068  *
6069  * This will release the metadata reservation for an inode.  This can be called
6070  * once we complete IO for a given set of bytes to release their metadata
6071  * reservations.
6072  */
6073 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
6074 {
6075         struct btrfs_root *root = BTRFS_I(inode)->root;
6076         u64 to_free = 0;
6077         unsigned dropped;
6078
6079         num_bytes = ALIGN(num_bytes, root->sectorsize);
6080         spin_lock(&BTRFS_I(inode)->lock);
6081         dropped = drop_outstanding_extent(inode, num_bytes);
6082
6083         if (num_bytes)
6084                 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
6085         spin_unlock(&BTRFS_I(inode)->lock);
6086         if (dropped > 0)
6087                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
6088
6089         if (btrfs_is_testing(root->fs_info))
6090                 return;
6091
6092         trace_btrfs_space_reservation(root->fs_info, "delalloc",
6093                                       btrfs_ino(inode), to_free, 0);
6094
6095         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
6096                                 to_free);
6097 }
6098
6099 /**
6100  * btrfs_delalloc_reserve_space - reserve data and metadata space for
6101  * delalloc
6102  * @inode: inode we're writing to
6103  * @start: start range we are writing to
6104  * @len: how long the range we are writing to
6105  *
6106  * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
6107  *
6108  * This will do the following things
6109  *
6110  * o reserve space in data space info for num bytes
6111  *   and reserve precious corresponding qgroup space
6112  *   (Done in check_data_free_space)
6113  *
6114  * o reserve space for metadata space, based on the number of outstanding
6115  *   extents and how much csums will be needed
6116  *   also reserve metadata space in a per root over-reserve method.
6117  * o add to the inodes->delalloc_bytes
6118  * o add it to the fs_info's delalloc inodes list.
6119  *   (Above 3 all done in delalloc_reserve_metadata)
6120  *
6121  * Return 0 for success
6122  * Return <0 for error(-ENOSPC or -EQUOT)
6123  */
6124 int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
6125 {
6126         int ret;
6127
6128         ret = btrfs_check_data_free_space(inode, start, len);
6129         if (ret < 0)
6130                 return ret;
6131         ret = btrfs_delalloc_reserve_metadata(inode, len);
6132         if (ret < 0)
6133                 btrfs_free_reserved_data_space(inode, start, len);
6134         return ret;
6135 }
6136
6137 /**
6138  * btrfs_delalloc_release_space - release data and metadata space for delalloc
6139  * @inode: inode we're releasing space for
6140  * @start: start position of the space already reserved
6141  * @len: the len of the space already reserved
6142  *
6143  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
6144  * called in the case that we don't need the metadata AND data reservations
6145  * anymore.  So if there is an error or we insert an inline extent.
6146  *
6147  * This function will release the metadata space that was not used and will
6148  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6149  * list if there are no delalloc bytes left.
6150  * Also it will handle the qgroup reserved space.
6151  */
6152 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
6153 {
6154         btrfs_delalloc_release_metadata(inode, len);
6155         btrfs_free_reserved_data_space(inode, start, len);
6156 }
6157
6158 static int update_block_group(struct btrfs_trans_handle *trans,
6159                               struct btrfs_root *root, u64 bytenr,
6160                               u64 num_bytes, int alloc)
6161 {
6162         struct btrfs_block_group_cache *cache = NULL;
6163         struct btrfs_fs_info *info = root->fs_info;
6164         u64 total = num_bytes;
6165         u64 old_val;
6166         u64 byte_in_group;
6167         int factor;
6168
6169         /* block accounting for super block */
6170         spin_lock(&info->delalloc_root_lock);
6171         old_val = btrfs_super_bytes_used(info->super_copy);
6172         if (alloc)
6173                 old_val += num_bytes;
6174         else
6175                 old_val -= num_bytes;
6176         btrfs_set_super_bytes_used(info->super_copy, old_val);
6177         spin_unlock(&info->delalloc_root_lock);
6178
6179         while (total) {
6180                 cache = btrfs_lookup_block_group(info, bytenr);
6181                 if (!cache)
6182                         return -ENOENT;
6183                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
6184                                     BTRFS_BLOCK_GROUP_RAID1 |
6185                                     BTRFS_BLOCK_GROUP_RAID10))
6186                         factor = 2;
6187                 else
6188                         factor = 1;
6189                 /*
6190                  * If this block group has free space cache written out, we
6191                  * need to make sure to load it if we are removing space.  This
6192                  * is because we need the unpinning stage to actually add the
6193                  * space back to the block group, otherwise we will leak space.
6194                  */
6195                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
6196                         cache_block_group(cache, 1);
6197
6198                 byte_in_group = bytenr - cache->key.objectid;
6199                 WARN_ON(byte_in_group > cache->key.offset);
6200
6201                 spin_lock(&cache->space_info->lock);
6202                 spin_lock(&cache->lock);
6203
6204                 if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
6205                     cache->disk_cache_state < BTRFS_DC_CLEAR)
6206                         cache->disk_cache_state = BTRFS_DC_CLEAR;
6207
6208                 old_val = btrfs_block_group_used(&cache->item);
6209                 num_bytes = min(total, cache->key.offset - byte_in_group);
6210                 if (alloc) {
6211                         old_val += num_bytes;
6212                         btrfs_set_block_group_used(&cache->item, old_val);
6213                         cache->reserved -= num_bytes;
6214                         cache->space_info->bytes_reserved -= num_bytes;
6215                         cache->space_info->bytes_used += num_bytes;
6216                         cache->space_info->disk_used += num_bytes * factor;
6217                         spin_unlock(&cache->lock);
6218                         spin_unlock(&cache->space_info->lock);
6219                 } else {
6220                         old_val -= num_bytes;
6221                         btrfs_set_block_group_used(&cache->item, old_val);
6222                         cache->pinned += num_bytes;
6223                         cache->space_info->bytes_pinned += num_bytes;
6224                         cache->space_info->bytes_used -= num_bytes;
6225                         cache->space_info->disk_used -= num_bytes * factor;
6226                         spin_unlock(&cache->lock);
6227                         spin_unlock(&cache->space_info->lock);
6228
6229                         trace_btrfs_space_reservation(root->fs_info, "pinned",
6230                                                       cache->space_info->flags,
6231                                                       num_bytes, 1);
6232                         set_extent_dirty(info->pinned_extents,
6233                                          bytenr, bytenr + num_bytes - 1,
6234                                          GFP_NOFS | __GFP_NOFAIL);
6235                 }
6236
6237                 spin_lock(&trans->transaction->dirty_bgs_lock);
6238                 if (list_empty(&cache->dirty_list)) {
6239                         list_add_tail(&cache->dirty_list,
6240                                       &trans->transaction->dirty_bgs);
6241                                 trans->transaction->num_dirty_bgs++;
6242                         btrfs_get_block_group(cache);
6243                 }
6244                 spin_unlock(&trans->transaction->dirty_bgs_lock);
6245
6246                 /*
6247                  * No longer have used bytes in this block group, queue it for
6248                  * deletion. We do this after adding the block group to the
6249                  * dirty list to avoid races between cleaner kthread and space
6250                  * cache writeout.
6251                  */
6252                 if (!alloc && old_val == 0) {
6253                         spin_lock(&info->unused_bgs_lock);
6254                         if (list_empty(&cache->bg_list)) {
6255                                 btrfs_get_block_group(cache);
6256                                 list_add_tail(&cache->bg_list,
6257                                               &info->unused_bgs);
6258                         }
6259                         spin_unlock(&info->unused_bgs_lock);
6260                 }
6261
6262                 btrfs_put_block_group(cache);
6263                 total -= num_bytes;
6264                 bytenr += num_bytes;
6265         }
6266         return 0;
6267 }
6268
6269 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
6270 {
6271         struct btrfs_block_group_cache *cache;
6272         u64 bytenr;
6273
6274         spin_lock(&root->fs_info->block_group_cache_lock);
6275         bytenr = root->fs_info->first_logical_byte;
6276         spin_unlock(&root->fs_info->block_group_cache_lock);
6277
6278         if (bytenr < (u64)-1)
6279                 return bytenr;
6280
6281         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
6282         if (!cache)
6283                 return 0;
6284
6285         bytenr = cache->key.objectid;
6286         btrfs_put_block_group(cache);
6287
6288         return bytenr;
6289 }
6290
6291 static int pin_down_extent(struct btrfs_root *root,
6292                            struct btrfs_block_group_cache *cache,
6293                            u64 bytenr, u64 num_bytes, int reserved)
6294 {
6295         spin_lock(&cache->space_info->lock);
6296         spin_lock(&cache->lock);
6297         cache->pinned += num_bytes;
6298         cache->space_info->bytes_pinned += num_bytes;
6299         if (reserved) {
6300                 cache->reserved -= num_bytes;
6301                 cache->space_info->bytes_reserved -= num_bytes;
6302         }
6303         spin_unlock(&cache->lock);
6304         spin_unlock(&cache->space_info->lock);
6305
6306         trace_btrfs_space_reservation(root->fs_info, "pinned",
6307                                       cache->space_info->flags, num_bytes, 1);
6308         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
6309                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6310         return 0;
6311 }
6312
6313 /*
6314  * this function must be called within transaction
6315  */
6316 int btrfs_pin_extent(struct btrfs_root *root,
6317                      u64 bytenr, u64 num_bytes, int reserved)
6318 {
6319         struct btrfs_block_group_cache *cache;
6320
6321         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
6322         BUG_ON(!cache); /* Logic error */
6323
6324         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
6325
6326         btrfs_put_block_group(cache);
6327         return 0;
6328 }
6329
6330 /*
6331  * this function must be called within transaction
6332  */
6333 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
6334                                     u64 bytenr, u64 num_bytes)
6335 {
6336         struct btrfs_block_group_cache *cache;
6337         int ret;
6338
6339         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
6340         if (!cache)
6341                 return -EINVAL;
6342
6343         /*
6344          * pull in the free space cache (if any) so that our pin
6345          * removes the free space from the cache.  We have load_only set
6346          * to one because the slow code to read in the free extents does check
6347          * the pinned extents.
6348          */
6349         cache_block_group(cache, 1);
6350
6351         pin_down_extent(root, cache, bytenr, num_bytes, 0);
6352
6353         /* remove us from the free space cache (if we're there at all) */
6354         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6355         btrfs_put_block_group(cache);
6356         return ret;
6357 }
6358
6359 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
6360 {
6361         int ret;
6362         struct btrfs_block_group_cache *block_group;
6363         struct btrfs_caching_control *caching_ctl;
6364
6365         block_group = btrfs_lookup_block_group(root->fs_info, start);
6366         if (!block_group)
6367                 return -EINVAL;
6368
6369         cache_block_group(block_group, 0);
6370         caching_ctl = get_caching_control(block_group);
6371
6372         if (!caching_ctl) {
6373                 /* Logic error */
6374                 BUG_ON(!block_group_cache_done(block_group));
6375                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6376         } else {
6377                 mutex_lock(&caching_ctl->mutex);
6378
6379                 if (start >= caching_ctl->progress) {
6380                         ret = add_excluded_extent(root, start, num_bytes);
6381                 } else if (start + num_bytes <= caching_ctl->progress) {
6382                         ret = btrfs_remove_free_space(block_group,
6383                                                       start, num_bytes);
6384                 } else {
6385                         num_bytes = caching_ctl->progress - start;
6386                         ret = btrfs_remove_free_space(block_group,
6387                                                       start, num_bytes);
6388                         if (ret)
6389                                 goto out_lock;
6390
6391                         num_bytes = (start + num_bytes) -
6392                                 caching_ctl->progress;
6393                         start = caching_ctl->progress;
6394                         ret = add_excluded_extent(root, start, num_bytes);
6395                 }
6396 out_lock:
6397                 mutex_unlock(&caching_ctl->mutex);
6398                 put_caching_control(caching_ctl);
6399         }
6400         btrfs_put_block_group(block_group);
6401         return ret;
6402 }
6403
6404 int btrfs_exclude_logged_extents(struct btrfs_root *log,
6405                                  struct extent_buffer *eb)
6406 {
6407         struct btrfs_file_extent_item *item;
6408         struct btrfs_key key;
6409         int found_type;
6410         int i;
6411
6412         if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
6413                 return 0;
6414
6415         for (i = 0; i < btrfs_header_nritems(eb); i++) {
6416                 btrfs_item_key_to_cpu(eb, &key, i);
6417                 if (key.type != BTRFS_EXTENT_DATA_KEY)
6418                         continue;
6419                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6420                 found_type = btrfs_file_extent_type(eb, item);
6421                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
6422                         continue;
6423                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6424                         continue;
6425                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6426                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6427                 __exclude_logged_extent(log, key.objectid, key.offset);
6428         }
6429
6430         return 0;
6431 }
6432
6433 static void
6434 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6435 {
6436         atomic_inc(&bg->reservations);
6437 }
6438
6439 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6440                                         const u64 start)
6441 {
6442         struct btrfs_block_group_cache *bg;
6443
6444         bg = btrfs_lookup_block_group(fs_info, start);
6445         ASSERT(bg);
6446         if (atomic_dec_and_test(&bg->reservations))
6447                 wake_up_atomic_t(&bg->reservations);
6448         btrfs_put_block_group(bg);
6449 }
6450
6451 static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
6452 {
6453         schedule();
6454         return 0;
6455 }
6456
6457 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6458 {
6459         struct btrfs_space_info *space_info = bg->space_info;
6460
6461         ASSERT(bg->ro);
6462
6463         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6464                 return;
6465
6466         /*
6467          * Our block group is read only but before we set it to read only,
6468          * some task might have had allocated an extent from it already, but it
6469          * has not yet created a respective ordered extent (and added it to a
6470          * root's list of ordered extents).
6471          * Therefore wait for any task currently allocating extents, since the
6472          * block group's reservations counter is incremented while a read lock
6473          * on the groups' semaphore is held and decremented after releasing
6474          * the read access on that semaphore and creating the ordered extent.
6475          */
6476         down_write(&space_info->groups_sem);
6477         up_write(&space_info->groups_sem);
6478
6479         wait_on_atomic_t(&bg->reservations,
6480                          btrfs_wait_bg_reservations_atomic_t,
6481                          TASK_UNINTERRUPTIBLE);
6482 }
6483
6484 /**
6485  * btrfs_add_reserved_bytes - update the block_group and space info counters
6486  * @cache:      The cache we are manipulating
6487  * @ram_bytes:  The number of bytes of file content, and will be same to
6488  *              @num_bytes except for the compress path.
6489  * @num_bytes:  The number of bytes in question
6490  * @delalloc:   The blocks are allocated for the delalloc write
6491  *
6492  * This is called by the allocator when it reserves space. Metadata
6493  * reservations should be called with RESERVE_ALLOC so we do the proper
6494  * ENOSPC accounting.  For data we handle the reservation through clearing the
6495  * delalloc bits in the io_tree.  We have to do this since we could end up
6496  * allocating less disk space for the amount of data we have reserved in the
6497  * case of compression.
6498  *
6499  * If this is a reservation and the block group has become read only we cannot
6500  * make the reservation and return -EAGAIN, otherwise this function always
6501  * succeeds.
6502  */
6503 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6504                                     u64 ram_bytes, u64 num_bytes, int delalloc)
6505 {
6506         struct btrfs_space_info *space_info = cache->space_info;
6507         int ret = 0;
6508
6509         spin_lock(&space_info->lock);
6510         spin_lock(&cache->lock);
6511         if (cache->ro) {
6512                 ret = -EAGAIN;
6513         } else {
6514                 cache->reserved += num_bytes;
6515                 space_info->bytes_reserved += num_bytes;
6516
6517                 trace_btrfs_space_reservation(cache->fs_info,
6518                                 "space_info", space_info->flags,
6519                                 ram_bytes, 0);
6520                 space_info->bytes_may_use -= ram_bytes;
6521                 if (delalloc)
6522                         cache->delalloc_bytes += num_bytes;
6523         }
6524         spin_unlock(&cache->lock);
6525         spin_unlock(&space_info->lock);
6526         return ret;
6527 }
6528
6529 /**
6530  * btrfs_free_reserved_bytes - update the block_group and space info counters
6531  * @cache:      The cache we are manipulating
6532  * @num_bytes:  The number of bytes in question
6533  * @delalloc:   The blocks are allocated for the delalloc write
6534  *
6535  * This is called by somebody who is freeing space that was never actually used
6536  * on disk.  For example if you reserve some space for a new leaf in transaction
6537  * A and before transaction A commits you free that leaf, you call this with
6538  * reserve set to 0 in order to clear the reservation.
6539  */
6540
6541 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6542                                      u64 num_bytes, int delalloc)
6543 {
6544         struct btrfs_space_info *space_info = cache->space_info;
6545         int ret = 0;
6546
6547         spin_lock(&space_info->lock);
6548         spin_lock(&cache->lock);
6549         if (cache->ro)
6550                 space_info->bytes_readonly += num_bytes;
6551         cache->reserved -= num_bytes;
6552         space_info->bytes_reserved -= num_bytes;
6553
6554         if (delalloc)
6555                 cache->delalloc_bytes -= num_bytes;
6556         spin_unlock(&cache->lock);
6557         spin_unlock(&space_info->lock);
6558         return ret;
6559 }
6560 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
6561                                 struct btrfs_root *root)
6562 {
6563         struct btrfs_fs_info *fs_info = root->fs_info;
6564         struct btrfs_caching_control *next;
6565         struct btrfs_caching_control *caching_ctl;
6566         struct btrfs_block_group_cache *cache;
6567
6568         down_write(&fs_info->commit_root_sem);
6569
6570         list_for_each_entry_safe(caching_ctl, next,
6571                                  &fs_info->caching_block_groups, list) {
6572                 cache = caching_ctl->block_group;
6573                 if (block_group_cache_done(cache)) {
6574                         cache->last_byte_to_unpin = (u64)-1;
6575                         list_del_init(&caching_ctl->list);
6576                         put_caching_control(caching_ctl);
6577                 } else {
6578                         cache->last_byte_to_unpin = caching_ctl->progress;
6579                 }
6580         }
6581
6582         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6583                 fs_info->pinned_extents = &fs_info->freed_extents[1];
6584         else
6585                 fs_info->pinned_extents = &fs_info->freed_extents[0];
6586
6587         up_write(&fs_info->commit_root_sem);
6588
6589         update_global_block_rsv(fs_info);
6590 }
6591
6592 /*
6593  * Returns the free cluster for the given space info and sets empty_cluster to
6594  * what it should be based on the mount options.
6595  */
6596 static struct btrfs_free_cluster *
6597 fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
6598                    u64 *empty_cluster)
6599 {
6600         struct btrfs_free_cluster *ret = NULL;
6601         bool ssd = btrfs_test_opt(root->fs_info, SSD);
6602
6603         *empty_cluster = 0;
6604         if (btrfs_mixed_space_info(space_info))
6605                 return ret;
6606
6607         if (ssd)
6608                 *empty_cluster = SZ_2M;
6609         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6610                 ret = &root->fs_info->meta_alloc_cluster;
6611                 if (!ssd)
6612                         *empty_cluster = SZ_64K;
6613         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
6614                 ret = &root->fs_info->data_alloc_cluster;
6615         }
6616
6617         return ret;
6618 }
6619
6620 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
6621                               const bool return_free_space)
6622 {
6623         struct btrfs_fs_info *fs_info = root->fs_info;
6624         struct btrfs_block_group_cache *cache = NULL;
6625         struct btrfs_space_info *space_info;
6626         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6627         struct btrfs_free_cluster *cluster = NULL;
6628         u64 len;
6629         u64 total_unpinned = 0;
6630         u64 empty_cluster = 0;
6631         bool readonly;
6632
6633         while (start <= end) {
6634                 readonly = false;
6635                 if (!cache ||
6636                     start >= cache->key.objectid + cache->key.offset) {
6637                         if (cache)
6638                                 btrfs_put_block_group(cache);
6639                         total_unpinned = 0;
6640                         cache = btrfs_lookup_block_group(fs_info, start);
6641                         BUG_ON(!cache); /* Logic error */
6642
6643                         cluster = fetch_cluster_info(root,
6644                                                      cache->space_info,
6645                                                      &empty_cluster);
6646                         empty_cluster <<= 1;
6647                 }
6648
6649                 len = cache->key.objectid + cache->key.offset - start;
6650                 len = min(len, end + 1 - start);
6651
6652                 if (start < cache->last_byte_to_unpin) {
6653                         len = min(len, cache->last_byte_to_unpin - start);
6654                         if (return_free_space)
6655                                 btrfs_add_free_space(cache, start, len);
6656                 }
6657
6658                 start += len;
6659                 total_unpinned += len;
6660                 space_info = cache->space_info;
6661
6662                 /*
6663                  * If this space cluster has been marked as fragmented and we've
6664                  * unpinned enough in this block group to potentially allow a
6665                  * cluster to be created inside of it go ahead and clear the
6666                  * fragmented check.
6667                  */
6668                 if (cluster && cluster->fragmented &&
6669                     total_unpinned > empty_cluster) {
6670                         spin_lock(&cluster->lock);
6671                         cluster->fragmented = 0;
6672                         spin_unlock(&cluster->lock);
6673                 }
6674
6675                 spin_lock(&space_info->lock);
6676                 spin_lock(&cache->lock);
6677                 cache->pinned -= len;
6678                 space_info->bytes_pinned -= len;
6679
6680                 trace_btrfs_space_reservation(fs_info, "pinned",
6681                                               space_info->flags, len, 0);
6682                 space_info->max_extent_size = 0;
6683                 percpu_counter_add(&space_info->total_bytes_pinned, -len);
6684                 if (cache->ro) {
6685                         space_info->bytes_readonly += len;
6686                         readonly = true;
6687                 }
6688                 spin_unlock(&cache->lock);
6689                 if (!readonly && return_free_space &&
6690                     global_rsv->space_info == space_info) {
6691                         u64 to_add = len;
6692                         WARN_ON(!return_free_space);
6693                         spin_lock(&global_rsv->lock);
6694                         if (!global_rsv->full) {
6695                                 to_add = min(len, global_rsv->size -
6696                                              global_rsv->reserved);
6697                                 global_rsv->reserved += to_add;
6698                                 space_info->bytes_may_use += to_add;
6699                                 if (global_rsv->reserved >= global_rsv->size)
6700                                         global_rsv->full = 1;
6701                                 trace_btrfs_space_reservation(fs_info,
6702                                                               "space_info",
6703                                                               space_info->flags,
6704                                                               to_add, 1);
6705                                 len -= to_add;
6706                         }
6707                         spin_unlock(&global_rsv->lock);
6708                         /* Add to any tickets we may have */
6709                         if (len)
6710                                 space_info_add_new_bytes(fs_info, space_info,
6711                                                          len);
6712                 }
6713                 spin_unlock(&space_info->lock);
6714         }
6715
6716         if (cache)
6717                 btrfs_put_block_group(cache);
6718         return 0;
6719 }
6720
6721 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
6722                                struct btrfs_root *root)
6723 {
6724         struct btrfs_fs_info *fs_info = root->fs_info;
6725         struct btrfs_block_group_cache *block_group, *tmp;
6726         struct list_head *deleted_bgs;
6727         struct extent_io_tree *unpin;
6728         u64 start;
6729         u64 end;
6730         int ret;
6731
6732         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6733                 unpin = &fs_info->freed_extents[1];
6734         else
6735                 unpin = &fs_info->freed_extents[0];
6736
6737         while (!trans->aborted) {
6738                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6739                 ret = find_first_extent_bit(unpin, 0, &start, &end,
6740                                             EXTENT_DIRTY, NULL);
6741                 if (ret) {
6742                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6743                         break;
6744                 }
6745
6746                 if (btrfs_test_opt(root->fs_info, DISCARD))
6747                         ret = btrfs_discard_extent(root, start,
6748                                                    end + 1 - start, NULL);
6749
6750                 clear_extent_dirty(unpin, start, end);
6751                 unpin_extent_range(root, start, end, true);
6752                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6753                 cond_resched();
6754         }
6755
6756         /*
6757          * Transaction is finished.  We don't need the lock anymore.  We
6758          * do need to clean up the block groups in case of a transaction
6759          * abort.
6760          */
6761         deleted_bgs = &trans->transaction->deleted_bgs;
6762         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6763                 u64 trimmed = 0;
6764
6765                 ret = -EROFS;
6766                 if (!trans->aborted)
6767                         ret = btrfs_discard_extent(root,
6768                                                    block_group->key.objectid,
6769                                                    block_group->key.offset,
6770                                                    &trimmed);
6771
6772                 list_del_init(&block_group->bg_list);
6773                 btrfs_put_block_group_trimming(block_group);
6774                 btrfs_put_block_group(block_group);
6775
6776                 if (ret) {
6777                         const char *errstr = btrfs_decode_error(ret);
6778                         btrfs_warn(fs_info,
6779                                    "Discard failed while removing blockgroup: errno=%d %s\n",
6780                                    ret, errstr);
6781                 }
6782         }
6783
6784         return 0;
6785 }
6786
6787 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
6788                              u64 owner, u64 root_objectid)
6789 {
6790         struct btrfs_space_info *space_info;
6791         u64 flags;
6792
6793         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6794                 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
6795                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
6796                 else
6797                         flags = BTRFS_BLOCK_GROUP_METADATA;
6798         } else {
6799                 flags = BTRFS_BLOCK_GROUP_DATA;
6800         }
6801
6802         space_info = __find_space_info(fs_info, flags);
6803         BUG_ON(!space_info); /* Logic bug */
6804         percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
6805 }
6806
6807
6808 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6809                                 struct btrfs_root *root,
6810                                 struct btrfs_delayed_ref_node *node, u64 parent,
6811                                 u64 root_objectid, u64 owner_objectid,
6812                                 u64 owner_offset, int refs_to_drop,
6813                                 struct btrfs_delayed_extent_op *extent_op)
6814 {
6815         struct btrfs_key key;
6816         struct btrfs_path *path;
6817         struct btrfs_fs_info *info = root->fs_info;
6818         struct btrfs_root *extent_root = info->extent_root;
6819         struct extent_buffer *leaf;
6820         struct btrfs_extent_item *ei;
6821         struct btrfs_extent_inline_ref *iref;
6822         int ret;
6823         int is_data;
6824         int extent_slot = 0;
6825         int found_extent = 0;
6826         int num_to_del = 1;
6827         u32 item_size;
6828         u64 refs;
6829         u64 bytenr = node->bytenr;
6830         u64 num_bytes = node->num_bytes;
6831         int last_ref = 0;
6832         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6833                                                  SKINNY_METADATA);
6834
6835         path = btrfs_alloc_path();
6836         if (!path)
6837                 return -ENOMEM;
6838
6839         path->reada = READA_FORWARD;
6840         path->leave_spinning = 1;
6841
6842         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6843         BUG_ON(!is_data && refs_to_drop != 1);
6844
6845         if (is_data)
6846                 skinny_metadata = 0;
6847
6848         ret = lookup_extent_backref(trans, extent_root, path, &iref,
6849                                     bytenr, num_bytes, parent,
6850                                     root_objectid, owner_objectid,
6851                                     owner_offset);
6852         if (ret == 0) {
6853                 extent_slot = path->slots[0];
6854                 while (extent_slot >= 0) {
6855                         btrfs_item_key_to_cpu(path->nodes[0], &key,
6856                                               extent_slot);
6857                         if (key.objectid != bytenr)
6858                                 break;
6859                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6860                             key.offset == num_bytes) {
6861                                 found_extent = 1;
6862                                 break;
6863                         }
6864                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
6865                             key.offset == owner_objectid) {
6866                                 found_extent = 1;
6867                                 break;
6868                         }
6869                         if (path->slots[0] - extent_slot > 5)
6870                                 break;
6871                         extent_slot--;
6872                 }
6873 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6874                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
6875                 if (found_extent && item_size < sizeof(*ei))
6876                         found_extent = 0;
6877 #endif
6878                 if (!found_extent) {
6879                         BUG_ON(iref);
6880                         ret = remove_extent_backref(trans, extent_root, path,
6881                                                     NULL, refs_to_drop,
6882                                                     is_data, &last_ref);
6883                         if (ret) {
6884                                 btrfs_abort_transaction(trans, ret);
6885                                 goto out;
6886                         }
6887                         btrfs_release_path(path);
6888                         path->leave_spinning = 1;
6889
6890                         key.objectid = bytenr;
6891                         key.type = BTRFS_EXTENT_ITEM_KEY;
6892                         key.offset = num_bytes;
6893
6894                         if (!is_data && skinny_metadata) {
6895                                 key.type = BTRFS_METADATA_ITEM_KEY;
6896                                 key.offset = owner_objectid;
6897                         }
6898
6899                         ret = btrfs_search_slot(trans, extent_root,
6900                                                 &key, path, -1, 1);
6901                         if (ret > 0 && skinny_metadata && path->slots[0]) {
6902                                 /*
6903                                  * Couldn't find our skinny metadata item,
6904                                  * see if we have ye olde extent item.
6905                                  */
6906                                 path->slots[0]--;
6907                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
6908                                                       path->slots[0]);
6909                                 if (key.objectid == bytenr &&
6910                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
6911                                     key.offset == num_bytes)
6912                                         ret = 0;
6913                         }
6914
6915                         if (ret > 0 && skinny_metadata) {
6916                                 skinny_metadata = false;
6917                                 key.objectid = bytenr;
6918                                 key.type = BTRFS_EXTENT_ITEM_KEY;
6919                                 key.offset = num_bytes;
6920                                 btrfs_release_path(path);
6921                                 ret = btrfs_search_slot(trans, extent_root,
6922                                                         &key, path, -1, 1);
6923                         }
6924
6925                         if (ret) {
6926                                 btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6927                                         ret, bytenr);
6928                                 if (ret > 0)
6929                                         btrfs_print_leaf(extent_root,
6930                                                          path->nodes[0]);
6931                         }
6932                         if (ret < 0) {
6933                                 btrfs_abort_transaction(trans, ret);
6934                                 goto out;
6935                         }
6936                         extent_slot = path->slots[0];
6937                 }
6938         } else if (WARN_ON(ret == -ENOENT)) {
6939                 btrfs_print_leaf(extent_root, path->nodes[0]);
6940                 btrfs_err(info,
6941                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
6942                         bytenr, parent, root_objectid, owner_objectid,
6943                         owner_offset);
6944                 btrfs_abort_transaction(trans, ret);
6945                 goto out;
6946         } else {
6947                 btrfs_abort_transaction(trans, ret);
6948                 goto out;
6949         }
6950
6951         leaf = path->nodes[0];
6952         item_size = btrfs_item_size_nr(leaf, extent_slot);
6953 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6954         if (item_size < sizeof(*ei)) {
6955                 BUG_ON(found_extent || extent_slot != path->slots[0]);
6956                 ret = convert_extent_item_v0(trans, extent_root, path,
6957                                              owner_objectid, 0);
6958                 if (ret < 0) {
6959                         btrfs_abort_transaction(trans, ret);
6960                         goto out;
6961                 }
6962
6963                 btrfs_release_path(path);
6964                 path->leave_spinning = 1;
6965
6966                 key.objectid = bytenr;
6967                 key.type = BTRFS_EXTENT_ITEM_KEY;
6968                 key.offset = num_bytes;
6969
6970                 ret = btrfs_search_slot(trans, extent_root, &key, path,
6971                                         -1, 1);
6972                 if (ret) {
6973                         btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6974                                 ret, bytenr);
6975                         btrfs_print_leaf(extent_root, path->nodes[0]);
6976                 }
6977                 if (ret < 0) {
6978                         btrfs_abort_transaction(trans, ret);
6979                         goto out;
6980                 }
6981
6982                 extent_slot = path->slots[0];
6983                 leaf = path->nodes[0];
6984                 item_size = btrfs_item_size_nr(leaf, extent_slot);
6985         }
6986 #endif
6987         BUG_ON(item_size < sizeof(*ei));
6988         ei = btrfs_item_ptr(leaf, extent_slot,
6989                             struct btrfs_extent_item);
6990         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
6991             key.type == BTRFS_EXTENT_ITEM_KEY) {
6992                 struct btrfs_tree_block_info *bi;
6993                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
6994                 bi = (struct btrfs_tree_block_info *)(ei + 1);
6995                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
6996         }
6997
6998         refs = btrfs_extent_refs(leaf, ei);
6999         if (refs < refs_to_drop) {
7000                 btrfs_err(info, "trying to drop %d refs but we only have %Lu "
7001                           "for bytenr %Lu", refs_to_drop, refs, bytenr);
7002                 ret = -EINVAL;
7003                 btrfs_abort_transaction(trans, ret);
7004                 goto out;
7005         }
7006         refs -= refs_to_drop;
7007
7008         if (refs > 0) {
7009                 if (extent_op)
7010                         __run_delayed_extent_op(extent_op, leaf, ei);
7011                 /*
7012                  * In the case of inline back ref, reference count will
7013                  * be updated by remove_extent_backref
7014                  */
7015                 if (iref) {
7016                         BUG_ON(!found_extent);
7017                 } else {
7018                         btrfs_set_extent_refs(leaf, ei, refs);
7019                         btrfs_mark_buffer_dirty(leaf);
7020                 }
7021                 if (found_extent) {
7022                         ret = remove_extent_backref(trans, extent_root, path,
7023                                                     iref, refs_to_drop,
7024                                                     is_data, &last_ref);
7025                         if (ret) {
7026                                 btrfs_abort_transaction(trans, ret);
7027                                 goto out;
7028                         }
7029                 }
7030                 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
7031                                  root_objectid);
7032         } else {
7033                 if (found_extent) {
7034                         BUG_ON(is_data && refs_to_drop !=
7035                                extent_data_ref_count(path, iref));
7036                         if (iref) {
7037                                 BUG_ON(path->slots[0] != extent_slot);
7038                         } else {
7039                                 BUG_ON(path->slots[0] != extent_slot + 1);
7040                                 path->slots[0] = extent_slot;
7041                                 num_to_del = 2;
7042                         }
7043                 }
7044
7045                 last_ref = 1;
7046                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
7047                                       num_to_del);
7048                 if (ret) {
7049                         btrfs_abort_transaction(trans, ret);
7050                         goto out;
7051                 }
7052                 btrfs_release_path(path);
7053
7054                 if (is_data) {
7055                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
7056                         if (ret) {
7057                                 btrfs_abort_transaction(trans, ret);
7058                                 goto out;
7059                         }
7060                 }
7061
7062                 ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
7063                                              num_bytes);
7064                 if (ret) {
7065                         btrfs_abort_transaction(trans, ret);
7066                         goto out;
7067                 }
7068
7069                 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
7070                 if (ret) {
7071                         btrfs_abort_transaction(trans, ret);
7072                         goto out;
7073                 }
7074         }
7075         btrfs_release_path(path);
7076
7077 out:
7078         btrfs_free_path(path);
7079         return ret;
7080 }
7081
7082 /*
7083  * when we free an block, it is possible (and likely) that we free the last
7084  * delayed ref for that extent as well.  This searches the delayed ref tree for
7085  * a given extent, and if there are no other delayed refs to be processed, it
7086  * removes it from the tree.
7087  */
7088 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
7089                                       struct btrfs_root *root, u64 bytenr)
7090 {
7091         struct btrfs_delayed_ref_head *head;
7092         struct btrfs_delayed_ref_root *delayed_refs;
7093         int ret = 0;
7094
7095         delayed_refs = &trans->transaction->delayed_refs;
7096         spin_lock(&delayed_refs->lock);
7097         head = btrfs_find_delayed_ref_head(trans, bytenr);
7098         if (!head)
7099                 goto out_delayed_unlock;
7100
7101         spin_lock(&head->lock);
7102         if (!list_empty(&head->ref_list))
7103                 goto out;
7104
7105         if (head->extent_op) {
7106                 if (!head->must_insert_reserved)
7107                         goto out;
7108                 btrfs_free_delayed_extent_op(head->extent_op);
7109                 head->extent_op = NULL;
7110         }
7111
7112         /*
7113          * waiting for the lock here would deadlock.  If someone else has it
7114          * locked they are already in the process of dropping it anyway
7115          */
7116         if (!mutex_trylock(&head->mutex))
7117                 goto out;
7118
7119         /*
7120          * at this point we have a head with no other entries.  Go
7121          * ahead and process it.
7122          */
7123         head->node.in_tree = 0;
7124         rb_erase(&head->href_node, &delayed_refs->href_root);
7125
7126         atomic_dec(&delayed_refs->num_entries);
7127
7128         /*
7129          * we don't take a ref on the node because we're removing it from the
7130          * tree, so we just steal the ref the tree was holding.
7131          */
7132         delayed_refs->num_heads--;
7133         if (head->processing == 0)
7134                 delayed_refs->num_heads_ready--;
7135         head->processing = 0;
7136         spin_unlock(&head->lock);
7137         spin_unlock(&delayed_refs->lock);
7138
7139         BUG_ON(head->extent_op);
7140         if (head->must_insert_reserved)
7141                 ret = 1;
7142
7143         mutex_unlock(&head->mutex);
7144         btrfs_put_delayed_ref(&head->node);
7145         return ret;
7146 out:
7147         spin_unlock(&head->lock);
7148
7149 out_delayed_unlock:
7150         spin_unlock(&delayed_refs->lock);
7151         return 0;
7152 }
7153
7154 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7155                            struct btrfs_root *root,
7156                            struct extent_buffer *buf,
7157                            u64 parent, int last_ref)
7158 {
7159         int pin = 1;
7160         int ret;
7161
7162         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7163                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
7164                                         buf->start, buf->len,
7165                                         parent, root->root_key.objectid,
7166                                         btrfs_header_level(buf),
7167                                         BTRFS_DROP_DELAYED_REF, NULL);
7168                 BUG_ON(ret); /* -ENOMEM */
7169         }
7170
7171         if (!last_ref)
7172                 return;
7173
7174         if (btrfs_header_generation(buf) == trans->transid) {
7175                 struct btrfs_block_group_cache *cache;
7176
7177                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7178                         ret = check_ref_cleanup(trans, root, buf->start);
7179                         if (!ret)
7180                                 goto out;
7181                 }
7182
7183                 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
7184
7185                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7186                         pin_down_extent(root, cache, buf->start, buf->len, 1);
7187                         btrfs_put_block_group(cache);
7188                         goto out;
7189                 }
7190
7191                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7192
7193                 btrfs_add_free_space(cache, buf->start, buf->len);
7194                 btrfs_free_reserved_bytes(cache, buf->len, 0);
7195                 btrfs_put_block_group(cache);
7196                 trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
7197                 pin = 0;
7198         }
7199 out:
7200         if (pin)
7201                 add_pinned_bytes(root->fs_info, buf->len,
7202                                  btrfs_header_level(buf),
7203                                  root->root_key.objectid);
7204
7205         /*
7206          * Deleting the buffer, clear the corrupt flag since it doesn't matter
7207          * anymore.
7208          */
7209         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7210 }
7211
7212 /* Can return -ENOMEM */
7213 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7214                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7215                       u64 owner, u64 offset)
7216 {
7217         int ret;
7218         struct btrfs_fs_info *fs_info = root->fs_info;
7219
7220         if (btrfs_is_testing(fs_info))
7221                 return 0;
7222
7223         add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
7224
7225         /*
7226          * tree log blocks never actually go into the extent allocation
7227          * tree, just update pinning info and exit early.
7228          */
7229         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7230                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
7231                 /* unlocks the pinned mutex */
7232                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
7233                 ret = 0;
7234         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7235                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
7236                                         num_bytes,
7237                                         parent, root_objectid, (int)owner,
7238                                         BTRFS_DROP_DELAYED_REF, NULL);
7239         } else {
7240                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
7241                                                 num_bytes,
7242                                                 parent, root_objectid, owner,
7243                                                 offset, 0,
7244                                                 BTRFS_DROP_DELAYED_REF, NULL);
7245         }
7246         return ret;
7247 }
7248
7249 /*
7250  * when we wait for progress in the block group caching, its because
7251  * our allocation attempt failed at least once.  So, we must sleep
7252  * and let some progress happen before we try again.
7253  *
7254  * This function will sleep at least once waiting for new free space to
7255  * show up, and then it will check the block group free space numbers
7256  * for our min num_bytes.  Another option is to have it go ahead
7257  * and look in the rbtree for a free extent of a given size, but this
7258  * is a good start.
7259  *
7260  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7261  * any of the information in this block group.
7262  */
7263 static noinline void
7264 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7265                                 u64 num_bytes)
7266 {
7267         struct btrfs_caching_control *caching_ctl;
7268
7269         caching_ctl = get_caching_control(cache);
7270         if (!caching_ctl)
7271                 return;
7272
7273         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7274                    (cache->free_space_ctl->free_space >= num_bytes));
7275
7276         put_caching_control(caching_ctl);
7277 }
7278
7279 static noinline int
7280 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7281 {
7282         struct btrfs_caching_control *caching_ctl;
7283         int ret = 0;
7284
7285         caching_ctl = get_caching_control(cache);
7286         if (!caching_ctl)
7287                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7288
7289         wait_event(caching_ctl->wait, block_group_cache_done(cache));
7290         if (cache->cached == BTRFS_CACHE_ERROR)
7291                 ret = -EIO;
7292         put_caching_control(caching_ctl);
7293         return ret;
7294 }
7295
7296 int __get_raid_index(u64 flags)
7297 {
7298         if (flags & BTRFS_BLOCK_GROUP_RAID10)
7299                 return BTRFS_RAID_RAID10;
7300         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
7301                 return BTRFS_RAID_RAID1;
7302         else if (flags & BTRFS_BLOCK_GROUP_DUP)
7303                 return BTRFS_RAID_DUP;
7304         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
7305                 return BTRFS_RAID_RAID0;
7306         else if (flags & BTRFS_BLOCK_GROUP_RAID5)
7307                 return BTRFS_RAID_RAID5;
7308         else if (flags & BTRFS_BLOCK_GROUP_RAID6)
7309                 return BTRFS_RAID_RAID6;
7310
7311         return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
7312 }
7313
7314 int get_block_group_index(struct btrfs_block_group_cache *cache)
7315 {
7316         return __get_raid_index(cache->flags);
7317 }
7318
7319 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
7320         [BTRFS_RAID_RAID10]     = "raid10",
7321         [BTRFS_RAID_RAID1]      = "raid1",
7322         [BTRFS_RAID_DUP]        = "dup",
7323         [BTRFS_RAID_RAID0]      = "raid0",
7324         [BTRFS_RAID_SINGLE]     = "single",
7325         [BTRFS_RAID_RAID5]      = "raid5",
7326         [BTRFS_RAID_RAID6]      = "raid6",
7327 };
7328
7329 static const char *get_raid_name(enum btrfs_raid_types type)
7330 {
7331         if (type >= BTRFS_NR_RAID_TYPES)
7332                 return NULL;
7333
7334         return btrfs_raid_type_names[type];
7335 }
7336
/*
 * Stage identifiers with consecutive values starting at 0.
 * NOTE(review): presumably the stages that find_free_extent() steps its
 * 'loop' counter through, in ascending order - confirm against that function.
 */
enum btrfs_loop_type {
	LOOP_CACHING_NOWAIT,	/* 0 */
	LOOP_CACHING_WAIT,	/* 1 */
	LOOP_ALLOC_CHUNK,	/* 2 */
	LOOP_NO_EMPTY_SIZE,	/* 3 */
};
7343
7344 static inline void
7345 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7346                        int delalloc)
7347 {
7348         if (delalloc)
7349                 down_read(&cache->data_rwsem);
7350 }
7351
7352 static inline void
7353 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
7354                        int delalloc)
7355 {
7356         btrfs_get_block_group(cache);
7357         if (delalloc)
7358                 down_read(&cache->data_rwsem);
7359 }
7360
7361 static struct btrfs_block_group_cache *
7362 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7363                    struct btrfs_free_cluster *cluster,
7364                    int delalloc)
7365 {
7366         struct btrfs_block_group_cache *used_bg = NULL;
7367
7368         spin_lock(&cluster->refill_lock);
7369         while (1) {
7370                 used_bg = cluster->block_group;
7371                 if (!used_bg)
7372                         return NULL;
7373
7374                 if (used_bg == block_group)
7375                         return used_bg;
7376
7377                 btrfs_get_block_group(used_bg);
7378
7379                 if (!delalloc)
7380                         return used_bg;
7381
7382                 if (down_read_trylock(&used_bg->data_rwsem))
7383                         return used_bg;
7384
7385                 spin_unlock(&cluster->refill_lock);
7386
7387                 down_read(&used_bg->data_rwsem);
7388
7389                 spin_lock(&cluster->refill_lock);
7390                 if (used_bg == cluster->block_group)
7391                         return used_bg;
7392
7393                 up_read(&used_bg->data_rwsem);
7394                 btrfs_put_block_group(used_bg);
7395         }
7396 }
7397
7398 static inline void
7399 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7400                          int delalloc)
7401 {
7402         if (delalloc)
7403                 up_read(&cache->data_rwsem);
7404         btrfs_put_block_group(cache);
7405 }
7406
7407 /*
7408  * Walks the block groups of the space_info selected by @flags and finds a
      * hole of @num_bytes, trying the free cluster (if there is one) before
      * falling back to an unclustered search of a block group's free space.
7409  * The key ins is changed to record the hole:
7410  * ins->objectid == start position
7411  * ins->type == BTRFS_EXTENT_ITEM_KEY
7412  * ins->offset == the size of the hole.
7413  * Any available blocks before search_start are skipped.
7414  *
7415  * If there is no suitable free space, we will record the max size of
7416  * the free space extent currently.
      *
      * @orig_root:  root the allocation is made for; handed to
      *              fetch_cluster_info() and the tracepoints.  The search
      *              itself uses orig_root->fs_info->extent_root.
      * @ram_bytes:  forwarded unchanged to btrfs_add_reserved_bytes().
      * @num_bytes:  size of the extent to find; a WARN_ON fires if it is
      *              smaller than root->sectorsize.
      * @empty_size: extra room asked for on top of @num_bytes; forced to 0
      *              for the final LOOP_NO_EMPTY_SIZE pass.
      * @hint_byte:  preferred start; replaced by the cluster's window_start
      *              when the cluster has a block group or is fragmented.
      * @ins:        output key, see above.
      * @flags:      block group flags; select the space_info and the raid
      *              index the search starts at.
      * @delalloc:   when non-zero, a block group's data_rwsem is held for
      *              reading while that block group is being examined.
      *
      * Returns 0 with the range reserved through btrfs_add_reserved_bytes(),
      * -ENOSPC when nothing fits (ins->offset and
      * space_info->max_extent_size are then set to the max_extent_size
      * gathered during the search), or another negative errno when joining
      * a transaction or forcing a chunk allocation fails.
7417  */
7418 static noinline int find_free_extent(struct btrfs_root *orig_root,
7419                                 u64 ram_bytes, u64 num_bytes, u64 empty_size,
7420                                 u64 hint_byte, struct btrfs_key *ins,
7421                                 u64 flags, int delalloc)
7422 {
7423         int ret = 0;
7424         struct btrfs_root *root = orig_root->fs_info->extent_root;
7425         struct btrfs_free_cluster *last_ptr = NULL;
7426         struct btrfs_block_group_cache *block_group = NULL;
7427         u64 search_start = 0;
7428         u64 max_extent_size = 0;
7429         u64 empty_cluster = 0;
7430         struct btrfs_space_info *space_info;
7431         int loop = 0;
7432         int index = __get_raid_index(flags);
7433         bool failed_cluster_refill = false;
7434         bool failed_alloc = false;
7435         bool use_cluster = true;
7436         bool have_caching_bg = false;
7437         bool orig_have_caching_bg = false;
7438         bool full_search = false;
7439
7440         WARN_ON(num_bytes < root->sectorsize);
             /*
              * ins->objectid == 0 doubles as the "nothing found yet" marker
              * that is tested after every pass over the block groups.
              */
7441         ins->type = BTRFS_EXTENT_ITEM_KEY;
7442         ins->objectid = 0;
7443         ins->offset = 0;
7444
7445         trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
7446
7447         space_info = __find_space_info(root->fs_info, flags);
7448         if (!space_info) {
7449                 btrfs_err(root->fs_info, "No space info for %llu", flags);
7450                 return -ENOSPC;
7451         }
7452
7453         /*
7454          * If our free space is heavily fragmented we may not be able to make
7455          * big contiguous allocations, so instead of doing the expensive search
7456          * for free space, simply return ENOSPC with our max_extent_size so we
7457          * can go ahead and search for a more manageable chunk.
7458          *
7459          * If our max_extent_size is large enough for our allocation simply
7460          * disable clustering since we will likely not be able to find enough
7461          * space to create a cluster and induce latency trying.
7462          */
7463         if (unlikely(space_info->max_extent_size)) {
7464                 spin_lock(&space_info->lock);
7465                 if (space_info->max_extent_size &&
7466                     num_bytes > space_info->max_extent_size) {
7467                         ins->offset = space_info->max_extent_size;
7468                         spin_unlock(&space_info->lock);
7469                         return -ENOSPC;
7470                 } else if (space_info->max_extent_size) {
7471                         use_cluster = false;
7472                 }
7473                 spin_unlock(&space_info->lock);
7474         }
7475
             /*
              * Pick up the free cluster for this root/space_info, if any, and
              * start the search at its window.  A cluster flagged as
              * fragmented is not used for allocation during this call.
              */
7476         last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
7477         if (last_ptr) {
7478                 spin_lock(&last_ptr->lock);
7479                 if (last_ptr->block_group)
7480                         hint_byte = last_ptr->window_start;
7481                 if (last_ptr->fragmented) {
7482                         /*
7483                          * We still set window_start so we can keep track of the
7484                          * last place we found an allocation to try and save
7485                          * some time.
7486                          */
7487                         hint_byte = last_ptr->window_start;
7488                         use_cluster = false;
7489                 }
7490                 spin_unlock(&last_ptr->lock);
7491         }
7492
             /*
              * If the hint is where the search would start anyway, try the
              * block group containing it before walking the lists.
              */
7493         search_start = max(search_start, first_logical_byte(root, 0));
7494         search_start = max(search_start, hint_byte);
7495         if (search_start == hint_byte) {
7496                 block_group = btrfs_lookup_block_group(root->fs_info,
7497                                                        search_start);
7498                 /*
7499                  * we don't want to use the block group if it doesn't match our
7500                  * allocation bits, or if it's not cached.
7501                  *
7502                  * However if we are re-searching with an ideal block group
7503                  * picked out then we don't care that the block group is cached.
7504                  */
7505                 if (block_group && block_group_bits(block_group, flags) &&
7506                     block_group->cached != BTRFS_CACHE_NO) {
7507                         down_read(&space_info->groups_sem);
7508                         if (list_empty(&block_group->list) ||
7509                             block_group->ro) {
7510                                 /*
7511                                  * someone is removing this block group,
7512                                  * we can't jump into the have_block_group
7513                                  * target because our list pointers are not
7514                                  * valid
7515                                  */
7516                                 btrfs_put_block_group(block_group);
7517                                 up_read(&space_info->groups_sem);
7518                         } else {
                                     /*
                                      * Jump into the list walk below with
                                      * groups_sem still held; it is dropped
                                      * once that walk ends.
                                      */
7519                                 index = get_block_group_index(block_group);
7520                                 btrfs_lock_block_group(block_group, delalloc);
7521                                 goto have_block_group;
7522                         }
7523                 } else if (block_group) {
7524                         btrfs_put_block_group(block_group);
7525                 }
7526         }
7527 search:
             /*
              * One pass over the block groups of the current raid index, with
              * groups_sem held for reading across the whole walk.
              */
7528         have_caching_bg = false;
7529         if (index == 0 || index == __get_raid_index(flags))
7530                 full_search = true;
7531         down_read(&space_info->groups_sem);
7532         list_for_each_entry(block_group, &space_info->block_groups[index],
7533                             list) {
7534                 u64 offset;
7535                 int cached;
7536
7537                 btrfs_grab_block_group(block_group, delalloc);
7538                 search_start = block_group->key.objectid;
7539
7540                 /*
7541                  * this can happen if we end up cycling through all the
7542                  * raid types, but we want to make sure we only allocate
7543                  * for the proper type.
7544                  */
7545                 if (!block_group_bits(block_group, flags)) {
7546                     u64 extra = BTRFS_BLOCK_GROUP_DUP |
7547                                 BTRFS_BLOCK_GROUP_RAID1 |
7548                                 BTRFS_BLOCK_GROUP_RAID5 |
7549                                 BTRFS_BLOCK_GROUP_RAID6 |
7550                                 BTRFS_BLOCK_GROUP_RAID10;
7551
7552                         /*
7553                          * if they asked for extra copies and this block group
7554                          * doesn't provide them, bail.  This does allow us to
7555                          * fill raid0 from raid1.
7556                          */
7557                         if ((flags & extra) && !(block_group->flags & extra))
7558                                 goto loop;
7559                 }
7560
7561 have_block_group:
                     /*
                      * Reached from the hinted block group above, from the list
                      * walk, and again after waiting for caching progress.
                      *
                      * Not fully cached yet: remember that for the retry logic
                      * after the walk and call cache_block_group(); a negative
                      * return is treated as fatal.
                      */
7562                 cached = block_group_cache_done(block_group);
7563                 if (unlikely(!cached)) {
7564                         have_caching_bg = true;
7565                         ret = cache_block_group(block_group, 0);
7566                         BUG_ON(ret < 0);
7567                         ret = 0;
7568                 }
7569
7570                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7571                         goto loop;
7572                 if (unlikely(block_group->ro))
7573                         goto loop;
7574
7575                 /*
7576                  * Ok we want to try and use the cluster allocator, so
7577                  * let's look there
7578                  */
7579                 if (last_ptr && use_cluster) {
7580                         struct btrfs_block_group_cache *used_block_group;
7581                         unsigned long aligned_cluster;
7582                         /*
7583                          * the refill lock keeps out other
7584                          * people trying to start a new cluster
7585                          */
7586                         used_block_group = btrfs_lock_cluster(block_group,
7587                                                               last_ptr,
7588                                                               delalloc);
7589                         if (!used_block_group)
7590                                 goto refill_cluster;
7591
7592                         if (used_block_group != block_group &&
7593                             (used_block_group->ro ||
7594                              !block_group_bits(used_block_group, flags)))
7595                                 goto release_cluster;
7596
7597                         offset = btrfs_alloc_from_cluster(used_block_group,
7598                                                 last_ptr,
7599                                                 num_bytes,
7600                                                 used_block_group->key.objectid,
7601                                                 &max_extent_size);
7602                         if (offset) {
7603                                 /* we have a block, we're done */
7604                                 spin_unlock(&last_ptr->refill_lock);
7605                                 trace_btrfs_reserve_extent_cluster(root,
7606                                                 used_block_group,
7607                                                 search_start, num_bytes);
                                     /*
                                      * The space came from the cluster's block
                                      * group: switch over to it and drop the
                                      * one we were walking.
                                      */
7608                                 if (used_block_group != block_group) {
7609                                         btrfs_release_block_group(block_group,
7610                                                                   delalloc);
7611                                         block_group = used_block_group;
7612                                 }
7613                                 goto checks;
7614                         }
7615
7616                         WARN_ON(last_ptr->block_group != used_block_group);
7617 release_cluster:
7618                         /* If we are on LOOP_NO_EMPTY_SIZE, we can't
7619                          * set up a new cluster, so let's just skip it
7620                          * and let the allocator find whatever block
7621                          * it can find.  If we reach this point, we
7622                          * will have tried the cluster allocator
7623                          * plenty of times and not have found
7624                          * anything, so we are likely way too
7625                          * fragmented for the clustering stuff to find
7626                          * anything.
7627                          *
7628                          * However, if the cluster is taken from the
7629                          * current block group, release the cluster
7630                          * first, so that we stand a better chance of
7631                          * succeeding in the unclustered
7632                          * allocation.  */
7633                         if (loop >= LOOP_NO_EMPTY_SIZE &&
7634                             used_block_group != block_group) {
7635                                 spin_unlock(&last_ptr->refill_lock);
7636                                 btrfs_release_block_group(used_block_group,
7637                                                           delalloc);
7638                                 goto unclustered_alloc;
7639                         }
7640
7641                         /*
7642                          * this cluster didn't work out, free it and
7643                          * start over
7644                          */
7645                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7646
7647                         if (used_block_group != block_group)
7648                                 btrfs_release_block_group(used_block_group,
7649                                                           delalloc);
7650 refill_cluster:
                             /* refill_lock is still held when we get here. */
7651                         if (loop >= LOOP_NO_EMPTY_SIZE) {
7652                                 spin_unlock(&last_ptr->refill_lock);
7653                                 goto unclustered_alloc;
7654                         }
7655
7656                         aligned_cluster = max_t(unsigned long,
7657                                                 empty_cluster + empty_size,
7658                                               block_group->full_stripe_len);
7659
7660                         /* allocate a cluster in this block group */
7661                         ret = btrfs_find_space_cluster(root, block_group,
7662                                                        last_ptr, search_start,
7663                                                        num_bytes,
7664                                                        aligned_cluster);
7665                         if (ret == 0) {
7666                                 /*
7667                                  * now pull our allocation out of this
7668                                  * cluster
7669                                  */
7670                                 offset = btrfs_alloc_from_cluster(block_group,
7671                                                         last_ptr,
7672                                                         num_bytes,
7673                                                         search_start,
7674                                                         &max_extent_size);
7675                                 if (offset) {
7676                                         /* we found one, proceed */
7677                                         spin_unlock(&last_ptr->refill_lock);
7678                                         trace_btrfs_reserve_extent_cluster(root,
7679                                                 block_group, search_start,
7680                                                 num_bytes);
7681                                         goto checks;
7682                                 }
7683                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
7684                                    && !failed_cluster_refill) {
                                     /*
                                      * Still caching and we are allowed to
                                      * wait: wait once for progress, then
                                      * retry this same block group.
                                      */
7685                                 spin_unlock(&last_ptr->refill_lock);
7686
7687                                 failed_cluster_refill = true;
7688                                 wait_block_group_cache_progress(block_group,
7689                                        num_bytes + empty_cluster + empty_size);
7690                                 goto have_block_group;
7691                         }
7692
7693                         /*
7694                          * at this point we either didn't find a cluster
7695                          * or we weren't able to allocate a block from our
7696                          * cluster.  Free the cluster we've been trying
7697                          * to use, and go to the next block group
7698                          */
7699                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7700                         spin_unlock(&last_ptr->refill_lock);
7701                         goto loop;
7702                 }
7703
7704 unclustered_alloc:
7705                 /*
7706                  * We are doing an unclustered alloc, set the fragmented flag so
7707                  * we don't bother trying to setup a cluster again until we get
7708                  * more space.
7709                  */
7710                 if (unlikely(last_ptr)) {
7711                         spin_lock(&last_ptr->lock);
7712                         last_ptr->fragmented = 1;
7713                         spin_unlock(&last_ptr->lock);
7714                 }
                     /*
                      * A fully cached block group whose total free space cannot
                      * cover the request is skipped; its free space still feeds
                      * max_extent_size for the ENOSPC report.
                      */
7715                 spin_lock(&block_group->free_space_ctl->tree_lock);
7716                 if (cached &&
7717                     block_group->free_space_ctl->free_space <
7718                     num_bytes + empty_cluster + empty_size) {
7719                         if (block_group->free_space_ctl->free_space >
7720                             max_extent_size)
7721                                 max_extent_size =
7722                                         block_group->free_space_ctl->free_space;
7723                         spin_unlock(&block_group->free_space_ctl->tree_lock);
7724                         goto loop;
7725                 }
7726                 spin_unlock(&block_group->free_space_ctl->tree_lock);
7727
7728                 offset = btrfs_find_space_for_alloc(block_group, search_start,
7729                                                     num_bytes, empty_size,
7730                                                     &max_extent_size);
7731                 /*
7732                  * If we didn't find a chunk, and we haven't failed on this
7733                  * block group before, and this block group is in the middle of
7734                  * caching and we are ok with waiting, then go ahead and wait
7735                  * for progress to be made, and set failed_alloc to true.
7736                  *
7737                  * If failed_alloc is true then we've already waited on this
7738                  * block group once and should move on to the next block group.
7739                  */
7740                 if (!offset && !failed_alloc && !cached &&
7741                     loop > LOOP_CACHING_NOWAIT) {
7742                         wait_block_group_cache_progress(block_group,
7743                                                 num_bytes + empty_size);
7744                         failed_alloc = true;
7745                         goto have_block_group;
7746                 } else if (!offset) {
7747                         goto loop;
7748                 }
7749 checks:
                     /*
                      * offset is the start of the space we were handed; the
                      * extent itself starts at the next stripesize boundary.
                      */
7750                 search_start = ALIGN(offset, root->stripesize);
7751
7752                 /* move on to the next group */
7753                 if (search_start + num_bytes >
7754                     block_group->key.objectid + block_group->key.offset) {
7755                         btrfs_add_free_space(block_group, offset, num_bytes);
7756                         goto loop;
7757                 }
7758
                     /* Give the alignment padding back to the free space. */
7759                 if (offset < search_start)
7760                         btrfs_add_free_space(block_group, offset,
7761                                              search_start - offset);
7762                 BUG_ON(offset > search_start);
7763
                     /*
                      * On -EAGAIN hand the space back and try the next block
                      * group.
                      */
7764                 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
7765                                 num_bytes, delalloc);
7766                 if (ret == -EAGAIN) {
7767                         btrfs_add_free_space(block_group, offset, num_bytes);
7768                         goto loop;
7769                 }
7770                 btrfs_inc_block_group_reservations(block_group);
7771
7772                 /* we are all good, let's return */
7773                 ins->objectid = search_start;
7774                 ins->offset = num_bytes;
7775
7776                 trace_btrfs_reserve_extent(orig_root, block_group,
7777                                            search_start, num_bytes);
7778                 btrfs_release_block_group(block_group, delalloc);
7779                 break;
7780 loop:
                     /*
                      * Done with this block group: reset the per-group retry
                      * flags and drop the reference (plus data_rwsem for
                      * delalloc) held on it.
                      */
7781                 failed_cluster_refill = false;
7782                 failed_alloc = false;
7783                 BUG_ON(index != get_block_group_index(block_group));
7784                 btrfs_release_block_group(block_group, delalloc);
7785         }
7786         up_read(&space_info->groups_sem);
7787
             /*
              * Remember whether the first (no-wait) pass saw any block group
              * that was still caching.
              */
7788         if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7789                 && !orig_have_caching_bg)
7790                 orig_have_caching_bg = true;
7791
             /*
              * Nothing found, but a block group was still caching and we are
              * allowed to wait: rescan the same raid index.
              */
7792         if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7793                 goto search;
7794
             /* Otherwise move on to the next raid index, if any is left. */
7795         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7796                 goto search;
7797
7798         /*
7799          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7800          *                      caching kthreads as we move along
7801          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7802          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7803          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7804          *                      again
7805          */
7806         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7807                 index = 0;
7808                 if (loop == LOOP_CACHING_NOWAIT) {
7809                         /*
7810                          * We want to skip the LOOP_CACHING_WAIT step if we
7811                          * don't have any uncached bgs and we've already done a
7812                          * full search through.
7813                          */
7814                         if (orig_have_caching_bg || !full_search)
7815                                 loop = LOOP_CACHING_WAIT;
7816                         else
7817                                 loop = LOOP_ALLOC_CHUNK;
7818                 } else {
7819                         loop++;
7820                 }
7821
7822                 if (loop == LOOP_ALLOC_CHUNK) {
7823                         struct btrfs_trans_handle *trans;
7824                         int exist = 0;
7825
                             /*
                              * Reuse the transaction already attached to this
                              * task if there is one; otherwise join one just
                              * for the chunk allocation and end it afterwards.
                              */
7826                         trans = current->journal_info;
7827                         if (trans)
7828                                 exist = 1;
7829                         else
7830                                 trans = btrfs_join_transaction(root);
7831
7832                         if (IS_ERR(trans)) {
7833                                 ret = PTR_ERR(trans);
7834                                 goto out;
7835                         }
7836
7837                         ret = do_chunk_alloc(trans, root, flags,
7838                                              CHUNK_ALLOC_FORCE);
7839
7840                         /*
7841                          * If we can't allocate a new chunk we've already looped
7842                          * through at least once, move on to the NO_EMPTY_SIZE
7843                          * case.
7844                          */
7845                         if (ret == -ENOSPC)
7846                                 loop = LOOP_NO_EMPTY_SIZE;
7847
7848                         /*
7849                          * Do not bail out on ENOSPC since we
7850                          * can do more things.
7851                          */
7852                         if (ret < 0 && ret != -ENOSPC)
7853                                 btrfs_abort_transaction(trans, ret);
7854                         else
7855                                 ret = 0;
7856                         if (!exist)
7857                                 btrfs_end_transaction(trans, root);
7858                         if (ret)
7859                                 goto out;
7860                 }
7861
7862                 if (loop == LOOP_NO_EMPTY_SIZE) {
7863                         /*
7864                          * Don't loop again if we already have no empty_size and
7865                          * no empty_cluster.
7866                          */
7867                         if (empty_size == 0 &&
7868                             empty_cluster == 0) {
7869                                 ret = -ENOSPC;
7870                                 goto out;
7871                         }
7872                         empty_size = 0;
7873                         empty_cluster = 0;
7874                 }
7875
7876                 goto search;
7877         } else if (!ins->objectid) {
7878                 ret = -ENOSPC;
7879         } else if (ins->objectid) {
                     /*
                      * Clustering was not used: record where this allocation
                      * landed so the next search can start from there.
                      */
7880                 if (!use_cluster && last_ptr) {
7881                         spin_lock(&last_ptr->lock);
7882                         last_ptr->window_start = ins->objectid;
7883                         spin_unlock(&last_ptr->lock);
7884                 }
7885                 ret = 0;
7886         }
7887 out:
             /*
              * On ENOSPC publish max_extent_size both in the space_info (it is
              * checked at the top of this function on later calls) and in
              * ins->offset for the caller.
              */
7888         if (ret == -ENOSPC) {
7889                 spin_lock(&space_info->lock);
7890                 space_info->max_extent_size = max_extent_size;
7891                 spin_unlock(&space_info->lock);
7892                 ins->offset = max_extent_size;
7893         }
7894         return ret;
7895 }
7896
7897 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
7898                             int dump_block_groups)
7899 {
7900         struct btrfs_block_group_cache *cache;
7901         int index = 0;
7902
7903         spin_lock(&info->lock);
7904         printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
7905                info->flags,
7906                info->total_bytes - info->bytes_used - info->bytes_pinned -
7907                info->bytes_reserved - info->bytes_readonly -
7908                info->bytes_may_use, (info->full) ? "" : "not ");
7909         printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
7910                "reserved=%llu, may_use=%llu, readonly=%llu\n",
7911                info->total_bytes, info->bytes_used, info->bytes_pinned,
7912                info->bytes_reserved, info->bytes_may_use,
7913                info->bytes_readonly);
7914         spin_unlock(&info->lock);
7915
7916         if (!dump_block_groups)
7917                 return;
7918
7919         down_read(&info->groups_sem);
7920 again:
7921         list_for_each_entry(cache, &info->block_groups[index], list) {
7922                 spin_lock(&cache->lock);
7923                 printk(KERN_INFO "BTRFS: "
7924                            "block group %llu has %llu bytes, "
7925                            "%llu used %llu pinned %llu reserved %s\n",
7926                        cache->key.objectid, cache->key.offset,
7927                        btrfs_block_group_used(&cache->item), cache->pinned,
7928                        cache->reserved, cache->ro ? "[readonly]" : "");
7929                 btrfs_dump_free_space(cache, bytes);
7930                 spin_unlock(&cache->lock);
7931         }
7932         if (++index < BTRFS_NR_RAID_TYPES)
7933                 goto again;
7934         up_read(&info->groups_sem);
7935 }
7936
7937 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
7938                          u64 num_bytes, u64 min_alloc_size,
7939                          u64 empty_size, u64 hint_byte,
7940                          struct btrfs_key *ins, int is_data, int delalloc)
7941 {
7942         bool final_tried = num_bytes == min_alloc_size;
7943         u64 flags;
7944         int ret;
7945
7946         flags = btrfs_get_alloc_profile(root, is_data);
7947 again:
7948         WARN_ON(num_bytes < root->sectorsize);
7949         ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
7950                                hint_byte, ins, flags, delalloc);
7951         if (!ret && !is_data) {
7952                 btrfs_dec_block_group_reservations(root->fs_info,
7953                                                    ins->objectid);
7954         } else if (ret == -ENOSPC) {
7955                 if (!final_tried && ins->offset) {
7956                         num_bytes = min(num_bytes >> 1, ins->offset);
7957                         num_bytes = round_down(num_bytes, root->sectorsize);
7958                         num_bytes = max(num_bytes, min_alloc_size);
7959                         ram_bytes = num_bytes;
7960                         if (num_bytes == min_alloc_size)
7961                                 final_tried = true;
7962                         goto again;
7963                 } else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
7964                         struct btrfs_space_info *sinfo;
7965
7966                         sinfo = __find_space_info(root->fs_info, flags);
7967                         btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
7968                                 flags, num_bytes);
7969                         if (sinfo)
7970                                 dump_space_info(sinfo, num_bytes, 1);
7971                 }
7972         }
7973
7974         return ret;
7975 }
7976
7977 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
7978                                         u64 start, u64 len,
7979                                         int pin, int delalloc)
7980 {
7981         struct btrfs_block_group_cache *cache;
7982         int ret = 0;
7983
7984         cache = btrfs_lookup_block_group(root->fs_info, start);
7985         if (!cache) {
7986                 btrfs_err(root->fs_info, "Unable to find block group for %llu",
7987                         start);
7988                 return -ENOSPC;
7989         }
7990
7991         if (pin)
7992                 pin_down_extent(root, cache, start, len, 1);
7993         else {
7994                 if (btrfs_test_opt(root->fs_info, DISCARD))
7995                         ret = btrfs_discard_extent(root, start, len, NULL);
7996                 btrfs_add_free_space(cache, start, len);
7997                 btrfs_free_reserved_bytes(cache, len, delalloc);
7998                 trace_btrfs_reserved_extent_free(root, start, len);
7999         }
8000
8001         btrfs_put_block_group(cache);
8002         return ret;
8003 }
8004
8005 int btrfs_free_reserved_extent(struct btrfs_root *root,
8006                                u64 start, u64 len, int delalloc)
8007 {
8008         return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
8009 }
8010
8011 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
8012                                        u64 start, u64 len)
8013 {
8014         return __btrfs_free_reserved_extent(root, start, len, 1, 0);
8015 }
8016
8017 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8018                                       struct btrfs_root *root,
8019                                       u64 parent, u64 root_objectid,
8020                                       u64 flags, u64 owner, u64 offset,
8021                                       struct btrfs_key *ins, int ref_mod)
8022 {
8023         int ret;
8024         struct btrfs_fs_info *fs_info = root->fs_info;
8025         struct btrfs_extent_item *extent_item;
8026         struct btrfs_extent_inline_ref *iref;
8027         struct btrfs_path *path;
8028         struct extent_buffer *leaf;
8029         int type;
8030         u32 size;
8031
8032         if (parent > 0)
8033                 type = BTRFS_SHARED_DATA_REF_KEY;
8034         else
8035                 type = BTRFS_EXTENT_DATA_REF_KEY;
8036
8037         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
8038
8039         path = btrfs_alloc_path();
8040         if (!path)
8041                 return -ENOMEM;
8042
8043         path->leave_spinning = 1;
8044         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8045                                       ins, size);
8046         if (ret) {
8047                 btrfs_free_path(path);
8048                 return ret;
8049         }
8050
8051         leaf = path->nodes[0];
8052         extent_item = btrfs_item_ptr(leaf, path->slots[0],
8053                                      struct btrfs_extent_item);
8054         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
8055         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8056         btrfs_set_extent_flags(leaf, extent_item,
8057                                flags | BTRFS_EXTENT_FLAG_DATA);
8058
8059         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8060         btrfs_set_extent_inline_ref_type(leaf, iref, type);
8061         if (parent > 0) {
8062                 struct btrfs_shared_data_ref *ref;
8063                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
8064                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8065                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8066         } else {
8067                 struct btrfs_extent_data_ref *ref;
8068                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8069                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8070                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8071                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8072                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8073         }
8074
8075         btrfs_mark_buffer_dirty(path->nodes[0]);
8076         btrfs_free_path(path);
8077
8078         ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8079                                           ins->offset);
8080         if (ret)
8081                 return ret;
8082
8083         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
8084         if (ret) { /* -ENOENT, logic error */
8085                 btrfs_err(fs_info, "update block group failed for %llu %llu",
8086                         ins->objectid, ins->offset);
8087                 BUG();
8088         }
8089         trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
8090         return ret;
8091 }
8092
8093 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8094                                      struct btrfs_root *root,
8095                                      u64 parent, u64 root_objectid,
8096                                      u64 flags, struct btrfs_disk_key *key,
8097                                      int level, struct btrfs_key *ins)
8098 {
8099         int ret;
8100         struct btrfs_fs_info *fs_info = root->fs_info;
8101         struct btrfs_extent_item *extent_item;
8102         struct btrfs_tree_block_info *block_info;
8103         struct btrfs_extent_inline_ref *iref;
8104         struct btrfs_path *path;
8105         struct extent_buffer *leaf;
8106         u32 size = sizeof(*extent_item) + sizeof(*iref);
8107         u64 num_bytes = ins->offset;
8108         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
8109                                                  SKINNY_METADATA);
8110
8111         if (!skinny_metadata)
8112                 size += sizeof(*block_info);
8113
8114         path = btrfs_alloc_path();
8115         if (!path) {
8116                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
8117                                                    root->nodesize);
8118                 return -ENOMEM;
8119         }
8120
8121         path->leave_spinning = 1;
8122         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8123                                       ins, size);
8124         if (ret) {
8125                 btrfs_free_path(path);
8126                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
8127                                                    root->nodesize);
8128                 return ret;
8129         }
8130
8131         leaf = path->nodes[0];
8132         extent_item = btrfs_item_ptr(leaf, path->slots[0],
8133                                      struct btrfs_extent_item);
8134         btrfs_set_extent_refs(leaf, extent_item, 1);
8135         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8136         btrfs_set_extent_flags(leaf, extent_item,
8137                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8138
8139         if (skinny_metadata) {
8140                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8141                 num_bytes = root->nodesize;
8142         } else {
8143                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8144                 btrfs_set_tree_block_key(leaf, block_info, key);
8145                 btrfs_set_tree_block_level(leaf, block_info, level);
8146                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8147         }
8148
8149         if (parent > 0) {
8150                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8151                 btrfs_set_extent_inline_ref_type(leaf, iref,
8152                                                  BTRFS_SHARED_BLOCK_REF_KEY);
8153                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8154         } else {
8155                 btrfs_set_extent_inline_ref_type(leaf, iref,
8156                                                  BTRFS_TREE_BLOCK_REF_KEY);
8157                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
8158         }
8159
8160         btrfs_mark_buffer_dirty(leaf);
8161         btrfs_free_path(path);
8162
8163         ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8164                                           num_bytes);
8165         if (ret)
8166                 return ret;
8167
8168         ret = update_block_group(trans, root, ins->objectid, root->nodesize,
8169                                  1);
8170         if (ret) { /* -ENOENT, logic error */
8171                 btrfs_err(fs_info, "update block group failed for %llu %llu",
8172                         ins->objectid, ins->offset);
8173                 BUG();
8174         }
8175
8176         trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
8177         return ret;
8178 }
8179
8180 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8181                                      struct btrfs_root *root,
8182                                      u64 root_objectid, u64 owner,
8183                                      u64 offset, u64 ram_bytes,
8184                                      struct btrfs_key *ins)
8185 {
8186         int ret;
8187
8188         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
8189
8190         ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
8191                                          ins->offset, 0,
8192                                          root_objectid, owner, offset,
8193                                          ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
8194                                          NULL);
8195         return ret;
8196 }
8197
8198 /*
8199  * this is used by the tree logging recovery code.  It records that
8200  * an extent has been allocated and makes sure to clear the free
8201  * space cache bits as well
8202  */
8203 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8204                                    struct btrfs_root *root,
8205                                    u64 root_objectid, u64 owner, u64 offset,
8206                                    struct btrfs_key *ins)
8207 {
8208         int ret;
8209         struct btrfs_block_group_cache *block_group;
8210
8211         /*
8212          * Mixed block groups will exclude before processing the log so we only
8213          * need to do the exclude dance if this fs isn't mixed.
8214          */
8215         if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
8216                 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
8217                 if (ret)
8218                         return ret;
8219         }
8220
8221         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
8222         if (!block_group)
8223                 return -EINVAL;
8224
8225         ret = btrfs_add_reserved_bytes(block_group, ins->offset,
8226                                        ins->offset, 0);
8227         BUG_ON(ret); /* logic error */
8228         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
8229                                          0, owner, offset, ins, 1);
8230         btrfs_put_block_group(block_group);
8231         return ret;
8232 }
8233
8234 static struct extent_buffer *
8235 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8236                       u64 bytenr, int level)
8237 {
8238         struct extent_buffer *buf;
8239
8240         buf = btrfs_find_create_tree_block(root, bytenr);
8241         if (IS_ERR(buf))
8242                 return buf;
8243
8244         btrfs_set_header_generation(buf, trans->transid);
8245         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8246         btrfs_tree_lock(buf);
8247         clean_tree_block(trans, root->fs_info, buf);
8248         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8249
8250         btrfs_set_lock_blocking(buf);
8251         set_extent_buffer_uptodate(buf);
8252
8253         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8254                 buf->log_index = root->log_transid % 2;
8255                 /*
8256                  * we allow two log transactions at a time, use different
8257                  * EXENT bit to differentiate dirty pages.
8258                  */
8259                 if (buf->log_index == 0)
8260                         set_extent_dirty(&root->dirty_log_pages, buf->start,
8261                                         buf->start + buf->len - 1, GFP_NOFS);
8262                 else
8263                         set_extent_new(&root->dirty_log_pages, buf->start,
8264                                         buf->start + buf->len - 1);
8265         } else {
8266                 buf->log_index = -1;
8267                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8268                          buf->start + buf->len - 1, GFP_NOFS);
8269         }
8270         trans->dirty = true;
8271         /* this returns a buffer locked for blocking */
8272         return buf;
8273 }
8274
8275 static struct btrfs_block_rsv *
8276 use_block_rsv(struct btrfs_trans_handle *trans,
8277               struct btrfs_root *root, u32 blocksize)
8278 {
8279         struct btrfs_block_rsv *block_rsv;
8280         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
8281         int ret;
8282         bool global_updated = false;
8283
8284         block_rsv = get_block_rsv(trans, root);
8285
8286         if (unlikely(block_rsv->size == 0))
8287                 goto try_reserve;
8288 again:
8289         ret = block_rsv_use_bytes(block_rsv, blocksize);
8290         if (!ret)
8291                 return block_rsv;
8292
8293         if (block_rsv->failfast)
8294                 return ERR_PTR(ret);
8295
8296         if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8297                 global_updated = true;
8298                 update_global_block_rsv(root->fs_info);
8299                 goto again;
8300         }
8301
8302         if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
8303                 static DEFINE_RATELIMIT_STATE(_rs,
8304                                 DEFAULT_RATELIMIT_INTERVAL * 10,
8305                                 /*DEFAULT_RATELIMIT_BURST*/ 1);
8306                 if (__ratelimit(&_rs))
8307                         WARN(1, KERN_DEBUG
8308                                 "BTRFS: block rsv returned %d\n", ret);
8309         }
8310 try_reserve:
8311         ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8312                                      BTRFS_RESERVE_NO_FLUSH);
8313         if (!ret)
8314                 return block_rsv;
8315         /*
8316          * If we couldn't reserve metadata bytes try and use some from
8317          * the global reserve if its space type is the same as the global
8318          * reservation.
8319          */
8320         if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8321             block_rsv->space_info == global_rsv->space_info) {
8322                 ret = block_rsv_use_bytes(global_rsv, blocksize);
8323                 if (!ret)
8324                         return global_rsv;
8325         }
8326         return ERR_PTR(ret);
8327 }
8328
8329 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8330                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
8331 {
8332         block_rsv_add_bytes(block_rsv, blocksize, 0);
8333         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
8334 }
8335
8336 /*
8337  * finds a free extent and does all the dirty work required for allocation
8338  * returns the tree buffer or an ERR_PTR on error.
8339  */
8340 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8341                                         struct btrfs_root *root,
8342                                         u64 parent, u64 root_objectid,
8343                                         struct btrfs_disk_key *key, int level,
8344                                         u64 hint, u64 empty_size)
8345 {
8346         struct btrfs_key ins;
8347         struct btrfs_block_rsv *block_rsv;
8348         struct extent_buffer *buf;
8349         struct btrfs_delayed_extent_op *extent_op;
8350         u64 flags = 0;
8351         int ret;
8352         u32 blocksize = root->nodesize;
8353         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
8354                                                  SKINNY_METADATA);
8355
8356 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8357         if (btrfs_is_testing(root->fs_info)) {
8358                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8359                                             level);
8360                 if (!IS_ERR(buf))
8361                         root->alloc_bytenr += blocksize;
8362                 return buf;
8363         }
8364 #endif
8365
8366         block_rsv = use_block_rsv(trans, root, blocksize);
8367         if (IS_ERR(block_rsv))
8368                 return ERR_CAST(block_rsv);
8369
8370         ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8371                                    empty_size, hint, &ins, 0, 0);
8372         if (ret)
8373                 goto out_unuse;
8374
8375         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
8376         if (IS_ERR(buf)) {
8377                 ret = PTR_ERR(buf);
8378                 goto out_free_reserved;
8379         }
8380
8381         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8382                 if (parent == 0)
8383                         parent = ins.objectid;
8384                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8385         } else
8386                 BUG_ON(parent > 0);
8387
8388         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8389                 extent_op = btrfs_alloc_delayed_extent_op();
8390                 if (!extent_op) {
8391                         ret = -ENOMEM;
8392                         goto out_free_buf;
8393                 }
8394                 if (key)
8395                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
8396                 else
8397                         memset(&extent_op->key, 0, sizeof(extent_op->key));
8398                 extent_op->flags_to_set = flags;
8399                 extent_op->update_key = skinny_metadata ? false : true;
8400                 extent_op->update_flags = true;
8401                 extent_op->is_data = false;
8402                 extent_op->level = level;
8403
8404                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
8405                                                  ins.objectid, ins.offset,
8406                                                  parent, root_objectid, level,
8407                                                  BTRFS_ADD_DELAYED_EXTENT,
8408                                                  extent_op);
8409                 if (ret)
8410                         goto out_free_delayed;
8411         }
8412         return buf;
8413
8414 out_free_delayed:
8415         btrfs_free_delayed_extent_op(extent_op);
8416 out_free_buf:
8417         free_extent_buffer(buf);
8418 out_free_reserved:
8419         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
8420 out_unuse:
8421         unuse_block_rsv(root->fs_info, block_rsv, blocksize);
8422         return ERR_PTR(ret);
8423 }
8424
/*
 * State shared by the walk_down/walk_up helpers while a tree is walked.
 */
struct walk_control {
	/*
	 * Per-level reference count and extent flags of the block at that
	 * level, as filled in by btrfs_lookup_extent_info().
	 */
	u64 refs[BTRFS_MAX_LEVEL];
	u64 flags[BTRFS_MAX_LEVEL];
	/* key that node keys are compared against when update_ref is set */
	struct btrfs_key update_progress;
	/* DROP_REFERENCE or UPDATE_BACKREF */
	int stage;
	/* tree level currently being processed */
	int level;
	/* NOTE(review): not used by the helpers visible here -- presumably the
	 * level at which a shared block was found; confirm with the callers */
	int shared_level;
	/* non-zero: back refs of shared subtrees may need updating */
	int update_ref;
	/* non-zero: walk_down_proc() keeps blocks locked while dropping */
	int keep_locks;
	/* slot at which the previous readahead pass stopped */
	int reada_slot;
	/* number of blocks to read ahead; adapted by reada_walk_down() */
	int reada_count;
	/* NOTE(review): presumably set when walking on behalf of relocation */
	int for_reloc;
};
8438
8439 #define DROP_REFERENCE  1
8440 #define UPDATE_BACKREF  2
8441
8442 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8443                                      struct btrfs_root *root,
8444                                      struct walk_control *wc,
8445                                      struct btrfs_path *path)
8446 {
8447         u64 bytenr;
8448         u64 generation;
8449         u64 refs;
8450         u64 flags;
8451         u32 nritems;
8452         u32 blocksize;
8453         struct btrfs_key key;
8454         struct extent_buffer *eb;
8455         int ret;
8456         int slot;
8457         int nread = 0;
8458
8459         if (path->slots[wc->level] < wc->reada_slot) {
8460                 wc->reada_count = wc->reada_count * 2 / 3;
8461                 wc->reada_count = max(wc->reada_count, 2);
8462         } else {
8463                 wc->reada_count = wc->reada_count * 3 / 2;
8464                 wc->reada_count = min_t(int, wc->reada_count,
8465                                         BTRFS_NODEPTRS_PER_BLOCK(root));
8466         }
8467
8468         eb = path->nodes[wc->level];
8469         nritems = btrfs_header_nritems(eb);
8470         blocksize = root->nodesize;
8471
8472         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8473                 if (nread >= wc->reada_count)
8474                         break;
8475
8476                 cond_resched();
8477                 bytenr = btrfs_node_blockptr(eb, slot);
8478                 generation = btrfs_node_ptr_generation(eb, slot);
8479
8480                 if (slot == path->slots[wc->level])
8481                         goto reada;
8482
8483                 if (wc->stage == UPDATE_BACKREF &&
8484                     generation <= root->root_key.offset)
8485                         continue;
8486
8487                 /* We don't lock the tree block, it's OK to be racy here */
8488                 ret = btrfs_lookup_extent_info(trans, root, bytenr,
8489                                                wc->level - 1, 1, &refs,
8490                                                &flags);
8491                 /* We don't care about errors in readahead. */
8492                 if (ret < 0)
8493                         continue;
8494                 BUG_ON(refs == 0);
8495
8496                 if (wc->stage == DROP_REFERENCE) {
8497                         if (refs == 1)
8498                                 goto reada;
8499
8500                         if (wc->level == 1 &&
8501                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8502                                 continue;
8503                         if (!wc->update_ref ||
8504                             generation <= root->root_key.offset)
8505                                 continue;
8506                         btrfs_node_key_to_cpu(eb, &key, slot);
8507                         ret = btrfs_comp_cpu_keys(&key,
8508                                                   &wc->update_progress);
8509                         if (ret < 0)
8510                                 continue;
8511                 } else {
8512                         if (wc->level == 1 &&
8513                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8514                                 continue;
8515                 }
8516 reada:
8517                 readahead_tree_block(root, bytenr);
8518                 nread++;
8519         }
8520         wc->reada_slot = slot;
8521 }
8522
8523 static int account_leaf_items(struct btrfs_trans_handle *trans,
8524                               struct btrfs_root *root,
8525                               struct extent_buffer *eb)
8526 {
8527         int nr = btrfs_header_nritems(eb);
8528         int i, extent_type, ret;
8529         struct btrfs_key key;
8530         struct btrfs_file_extent_item *fi;
8531         u64 bytenr, num_bytes;
8532
8533         /* We can be called directly from walk_up_proc() */
8534         if (!root->fs_info->quota_enabled)
8535                 return 0;
8536
8537         for (i = 0; i < nr; i++) {
8538                 btrfs_item_key_to_cpu(eb, &key, i);
8539
8540                 if (key.type != BTRFS_EXTENT_DATA_KEY)
8541                         continue;
8542
8543                 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
8544                 /* filter out non qgroup-accountable extents  */
8545                 extent_type = btrfs_file_extent_type(eb, fi);
8546
8547                 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
8548                         continue;
8549
8550                 bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
8551                 if (!bytenr)
8552                         continue;
8553
8554                 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
8555
8556                 ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
8557                                 bytenr, num_bytes, GFP_NOFS);
8558                 if (ret)
8559                         return ret;
8560         }
8561         return 0;
8562 }
8563
8564 /*
8565  * Walk up the tree from the bottom, freeing leaves and any interior
8566  * nodes which have had all slots visited. If a node (leaf or
8567  * interior) is freed, the node above it will have it's slot
8568  * incremented. The root node will never be freed.
8569  *
8570  * At the end of this function, we should have a path which has all
8571  * slots incremented to the next position for a search. If we need to
8572  * read a new node it will be NULL and the node above it will have the
8573  * correct slot selected for a later read.
8574  *
8575  * If we increment the root nodes slot counter past the number of
8576  * elements, 1 is returned to signal completion of the search.
8577  */
8578 static int adjust_slots_upwards(struct btrfs_root *root,
8579                                 struct btrfs_path *path, int root_level)
8580 {
8581         int level = 0;
8582         int nr, slot;
8583         struct extent_buffer *eb;
8584
8585         if (root_level == 0)
8586                 return 1;
8587
8588         while (level <= root_level) {
8589                 eb = path->nodes[level];
8590                 nr = btrfs_header_nritems(eb);
8591                 path->slots[level]++;
8592                 slot = path->slots[level];
8593                 if (slot >= nr || level == 0) {
8594                         /*
8595                          * Don't free the root -  we will detect this
8596                          * condition after our loop and return a
8597                          * positive value for caller to stop walking the tree.
8598                          */
8599                         if (level != root_level) {
8600                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8601                                 path->locks[level] = 0;
8602
8603                                 free_extent_buffer(eb);
8604                                 path->nodes[level] = NULL;
8605                                 path->slots[level] = 0;
8606                         }
8607                 } else {
8608                         /*
8609                          * We have a valid slot to walk back down
8610                          * from. Stop here so caller can process these
8611                          * new nodes.
8612                          */
8613                         break;
8614                 }
8615
8616                 level++;
8617         }
8618
8619         eb = path->nodes[root_level];
8620         if (path->slots[root_level] >= btrfs_header_nritems(eb))
8621                 return 1;
8622
8623         return 0;
8624 }
8625
8626 /*
8627  * root_eb is the subtree root and is locked before this function is called.
8628  */
8629 static int account_shared_subtree(struct btrfs_trans_handle *trans,
8630                                   struct btrfs_root *root,
8631                                   struct extent_buffer *root_eb,
8632                                   u64 root_gen,
8633                                   int root_level)
8634 {
8635         int ret = 0;
8636         int level;
8637         struct extent_buffer *eb = root_eb;
8638         struct btrfs_path *path = NULL;
8639
8640         BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
8641         BUG_ON(root_eb == NULL);
8642
8643         if (!root->fs_info->quota_enabled)
8644                 return 0;
8645
8646         if (!extent_buffer_uptodate(root_eb)) {
8647                 ret = btrfs_read_buffer(root_eb, root_gen);
8648                 if (ret)
8649                         goto out;
8650         }
8651
8652         if (root_level == 0) {
8653                 ret = account_leaf_items(trans, root, root_eb);
8654                 goto out;
8655         }
8656
8657         path = btrfs_alloc_path();
8658         if (!path)
8659                 return -ENOMEM;
8660
8661         /*
8662          * Walk down the tree.  Missing extent blocks are filled in as
8663          * we go. Metadata is accounted every time we read a new
8664          * extent block.
8665          *
8666          * When we reach a leaf, we account for file extent items in it,
8667          * walk back up the tree (adjusting slot pointers as we go)
8668          * and restart the search process.
8669          */
8670         extent_buffer_get(root_eb); /* For path */
8671         path->nodes[root_level] = root_eb;
8672         path->slots[root_level] = 0;
8673         path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
8674 walk_down:
8675         level = root_level;
8676         while (level >= 0) {
8677                 if (path->nodes[level] == NULL) {
8678                         int parent_slot;
8679                         u64 child_gen;
8680                         u64 child_bytenr;
8681
8682                         /* We need to get child blockptr/gen from
8683                          * parent before we can read it. */
8684                         eb = path->nodes[level + 1];
8685                         parent_slot = path->slots[level + 1];
8686                         child_bytenr = btrfs_node_blockptr(eb, parent_slot);
8687                         child_gen = btrfs_node_ptr_generation(eb, parent_slot);
8688
8689                         eb = read_tree_block(root, child_bytenr, child_gen);
8690                         if (IS_ERR(eb)) {
8691                                 ret = PTR_ERR(eb);
8692                                 goto out;
8693                         } else if (!extent_buffer_uptodate(eb)) {
8694                                 free_extent_buffer(eb);
8695                                 ret = -EIO;
8696                                 goto out;
8697                         }
8698
8699                         path->nodes[level] = eb;
8700                         path->slots[level] = 0;
8701
8702                         btrfs_tree_read_lock(eb);
8703                         btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
8704                         path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
8705
8706                         ret = btrfs_qgroup_insert_dirty_extent(trans,
8707                                         root->fs_info, child_bytenr,
8708                                         root->nodesize, GFP_NOFS);
8709                         if (ret)
8710                                 goto out;
8711                 }
8712
8713                 if (level == 0) {
8714                         ret = account_leaf_items(trans, root, path->nodes[level]);
8715                         if (ret)
8716                                 goto out;
8717
8718                         /* Nonzero return here means we completed our search */
8719                         ret = adjust_slots_upwards(root, path, root_level);
8720                         if (ret)
8721                                 break;
8722
8723                         /* Restart search with new slots */
8724                         goto walk_down;
8725                 }
8726
8727                 level--;
8728         }
8729
8730         ret = 0;
8731 out:
8732         btrfs_free_path(path);
8733
8734         return ret;
8735 }
8736
8737 /*
8738  * helper to process tree block while walking down the tree.
8739  *
8740  * when wc->stage == UPDATE_BACKREF, this function updates
8741  * back refs for pointers in the block.
8742  *
8743  * NOTE: return value 1 means we should stop walking down.
8744  */
8745 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8746                                    struct btrfs_root *root,
8747                                    struct btrfs_path *path,
8748                                    struct walk_control *wc, int lookup_info)
8749 {
8750         int level = wc->level;
8751         struct extent_buffer *eb = path->nodes[level];
8752         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8753         int ret;
8754
8755         if (wc->stage == UPDATE_BACKREF &&
8756             btrfs_header_owner(eb) != root->root_key.objectid)
8757                 return 1;
8758
8759         /*
8760          * when reference count of tree block is 1, it won't increase
8761          * again. once full backref flag is set, we never clear it.
8762          */
8763         if (lookup_info &&
8764             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8765              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8766                 BUG_ON(!path->locks[level]);
8767                 ret = btrfs_lookup_extent_info(trans, root,
8768                                                eb->start, level, 1,
8769                                                &wc->refs[level],
8770                                                &wc->flags[level]);
8771                 BUG_ON(ret == -ENOMEM);
8772                 if (ret)
8773                         return ret;
8774                 BUG_ON(wc->refs[level] == 0);
8775         }
8776
8777         if (wc->stage == DROP_REFERENCE) {
8778                 if (wc->refs[level] > 1)
8779                         return 1;
8780
8781                 if (path->locks[level] && !wc->keep_locks) {
8782                         btrfs_tree_unlock_rw(eb, path->locks[level]);
8783                         path->locks[level] = 0;
8784                 }
8785                 return 0;
8786         }
8787
8788         /* wc->stage == UPDATE_BACKREF */
8789         if (!(wc->flags[level] & flag)) {
8790                 BUG_ON(!path->locks[level]);
8791                 ret = btrfs_inc_ref(trans, root, eb, 1);
8792                 BUG_ON(ret); /* -ENOMEM */
8793                 ret = btrfs_dec_ref(trans, root, eb, 0);
8794                 BUG_ON(ret); /* -ENOMEM */
8795                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
8796                                                   eb->len, flag,
8797                                                   btrfs_header_level(eb), 0);
8798                 BUG_ON(ret); /* -ENOMEM */
8799                 wc->flags[level] |= flag;
8800         }
8801
8802         /*
8803          * the block is shared by multiple trees, so it's not good to
8804          * keep the tree lock
8805          */
8806         if (path->locks[level] && level > 0) {
8807                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8808                 path->locks[level] = 0;
8809         }
8810         return 0;
8811 }
8812
8813 /*
8814  * helper to process tree block pointer.
8815  *
8816  * when wc->stage == DROP_REFERENCE, this function checks
8817  * reference count of the block pointed to. if the block
8818  * is shared and we need update back refs for the subtree
8819  * rooted at the block, this function changes wc->stage to
8820  * UPDATE_BACKREF. if the block is shared and there is no
8821  * need to update back, this function drops the reference
8822  * to the block.
8823  *
8824  * NOTE: return value 1 means we should stop walking down.
8825  */
8826 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8827                                  struct btrfs_root *root,
8828                                  struct btrfs_path *path,
8829                                  struct walk_control *wc, int *lookup_info)
8830 {
8831         u64 bytenr;
8832         u64 generation;
8833         u64 parent;
8834         u32 blocksize;
8835         struct btrfs_key key;
8836         struct extent_buffer *next;
8837         int level = wc->level;
8838         int reada = 0;
8839         int ret = 0;
8840         bool need_account = false;
8841
8842         generation = btrfs_node_ptr_generation(path->nodes[level],
8843                                                path->slots[level]);
8844         /*
8845          * if the lower level block was created before the snapshot
8846          * was created, we know there is no need to update back refs
8847          * for the subtree
8848          */
8849         if (wc->stage == UPDATE_BACKREF &&
8850             generation <= root->root_key.offset) {
8851                 *lookup_info = 1;
8852                 return 1;
8853         }
8854
8855         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8856         blocksize = root->nodesize;
8857
8858         next = btrfs_find_tree_block(root->fs_info, bytenr);
8859         if (!next) {
8860                 next = btrfs_find_create_tree_block(root, bytenr);
8861                 if (IS_ERR(next))
8862                         return PTR_ERR(next);
8863
8864                 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8865                                                level - 1);
8866                 reada = 1;
8867         }
8868         btrfs_tree_lock(next);
8869         btrfs_set_lock_blocking(next);
8870
8871         ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
8872                                        &wc->refs[level - 1],
8873                                        &wc->flags[level - 1]);
8874         if (ret < 0) {
8875                 btrfs_tree_unlock(next);
8876                 return ret;
8877         }
8878
8879         if (unlikely(wc->refs[level - 1] == 0)) {
8880                 btrfs_err(root->fs_info, "Missing references.");
8881                 BUG();
8882         }
8883         *lookup_info = 0;
8884
8885         if (wc->stage == DROP_REFERENCE) {
8886                 if (wc->refs[level - 1] > 1) {
8887                         need_account = true;
8888                         if (level == 1 &&
8889                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8890                                 goto skip;
8891
8892                         if (!wc->update_ref ||
8893                             generation <= root->root_key.offset)
8894                                 goto skip;
8895
8896                         btrfs_node_key_to_cpu(path->nodes[level], &key,
8897                                               path->slots[level]);
8898                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8899                         if (ret < 0)
8900                                 goto skip;
8901
8902                         wc->stage = UPDATE_BACKREF;
8903                         wc->shared_level = level - 1;
8904                 }
8905         } else {
8906                 if (level == 1 &&
8907                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8908                         goto skip;
8909         }
8910
8911         if (!btrfs_buffer_uptodate(next, generation, 0)) {
8912                 btrfs_tree_unlock(next);
8913                 free_extent_buffer(next);
8914                 next = NULL;
8915                 *lookup_info = 1;
8916         }
8917
8918         if (!next) {
8919                 if (reada && level == 1)
8920                         reada_walk_down(trans, root, wc, path);
8921                 next = read_tree_block(root, bytenr, generation);
8922                 if (IS_ERR(next)) {
8923                         return PTR_ERR(next);
8924                 } else if (!extent_buffer_uptodate(next)) {
8925                         free_extent_buffer(next);
8926                         return -EIO;
8927                 }
8928                 btrfs_tree_lock(next);
8929                 btrfs_set_lock_blocking(next);
8930         }
8931
8932         level--;
8933         BUG_ON(level != btrfs_header_level(next));
8934         path->nodes[level] = next;
8935         path->slots[level] = 0;
8936         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8937         wc->level = level;
8938         if (wc->level == 1)
8939                 wc->reada_slot = 0;
8940         return 0;
8941 skip:
8942         wc->refs[level - 1] = 0;
8943         wc->flags[level - 1] = 0;
8944         if (wc->stage == DROP_REFERENCE) {
8945                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8946                         parent = path->nodes[level]->start;
8947                 } else {
8948                         BUG_ON(root->root_key.objectid !=
8949                                btrfs_header_owner(path->nodes[level]));
8950                         parent = 0;
8951                 }
8952
8953                 if (need_account) {
8954                         ret = account_shared_subtree(trans, root, next,
8955                                                      generation, level - 1);
8956                         if (ret) {
8957                                 btrfs_err_rl(root->fs_info,
8958                                         "Error "
8959                                         "%d accounting shared subtree. Quota "
8960                                         "is out of sync, rescan required.",
8961                                         ret);
8962                         }
8963                 }
8964                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
8965                                 root->root_key.objectid, level - 1, 0);
8966                 BUG_ON(ret); /* -ENOMEM */
8967         }
8968         btrfs_tree_unlock(next);
8969         free_extent_buffer(next);
8970         *lookup_info = 1;
8971         return 1;
8972 }
8973
8974 /*
8975  * helper to process tree block while walking up the tree.
8976  *
8977  * when wc->stage == DROP_REFERENCE, this function drops
8978  * reference count on the block.
8979  *
8980  * when wc->stage == UPDATE_BACKREF, this function changes
8981  * wc->stage back to DROP_REFERENCE if we changed wc->stage
8982  * to UPDATE_BACKREF previously while processing the block.
8983  *
8984  * NOTE: return value 1 means we should stop walking up.
8985  */
8986 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8987                                  struct btrfs_root *root,
8988                                  struct btrfs_path *path,
8989                                  struct walk_control *wc)
8990 {
8991         int ret;
8992         int level = wc->level;
8993         struct extent_buffer *eb = path->nodes[level];
8994         u64 parent = 0;
8995
8996         if (wc->stage == UPDATE_BACKREF) {
8997                 BUG_ON(wc->shared_level < level);
8998                 if (level < wc->shared_level)
8999                         goto out;
9000
9001                 ret = find_next_key(path, level + 1, &wc->update_progress);
9002                 if (ret > 0)
9003                         wc->update_ref = 0;
9004
9005                 wc->stage = DROP_REFERENCE;
9006                 wc->shared_level = -1;
9007                 path->slots[level] = 0;
9008
9009                 /*
9010                  * check reference count again if the block isn't locked.
9011                  * we should start walking down the tree again if reference
9012                  * count is one.
9013                  */
9014                 if (!path->locks[level]) {
9015                         BUG_ON(level == 0);
9016                         btrfs_tree_lock(eb);
9017                         btrfs_set_lock_blocking(eb);
9018                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9019
9020                         ret = btrfs_lookup_extent_info(trans, root,
9021                                                        eb->start, level, 1,
9022                                                        &wc->refs[level],
9023                                                        &wc->flags[level]);
9024                         if (ret < 0) {
9025                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
9026                                 path->locks[level] = 0;
9027                                 return ret;
9028                         }
9029                         BUG_ON(wc->refs[level] == 0);
9030                         if (wc->refs[level] == 1) {
9031                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
9032                                 path->locks[level] = 0;
9033                                 return 1;
9034                         }
9035                 }
9036         }
9037
9038         /* wc->stage == DROP_REFERENCE */
9039         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
9040
9041         if (wc->refs[level] == 1) {
9042                 if (level == 0) {
9043                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9044                                 ret = btrfs_dec_ref(trans, root, eb, 1);
9045                         else
9046                                 ret = btrfs_dec_ref(trans, root, eb, 0);
9047                         BUG_ON(ret); /* -ENOMEM */
9048                         ret = account_leaf_items(trans, root, eb);
9049                         if (ret) {
9050                                 btrfs_err_rl(root->fs_info,
9051                                         "error "
9052                                         "%d accounting leaf items. Quota "
9053                                         "is out of sync, rescan required.",
9054                                         ret);
9055                         }
9056                 }
9057                 /* make block locked assertion in clean_tree_block happy */
9058                 if (!path->locks[level] &&
9059                     btrfs_header_generation(eb) == trans->transid) {
9060                         btrfs_tree_lock(eb);
9061                         btrfs_set_lock_blocking(eb);
9062                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9063                 }
9064                 clean_tree_block(trans, root->fs_info, eb);
9065         }
9066
9067         if (eb == root->node) {
9068                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9069                         parent = eb->start;
9070                 else
9071                         BUG_ON(root->root_key.objectid !=
9072                                btrfs_header_owner(eb));
9073         } else {
9074                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9075                         parent = path->nodes[level + 1]->start;
9076                 else
9077                         BUG_ON(root->root_key.objectid !=
9078                                btrfs_header_owner(path->nodes[level + 1]));
9079         }
9080
9081         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
9082 out:
9083         wc->refs[level] = 0;
9084         wc->flags[level] = 0;
9085         return 0;
9086 }
9087
9088 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
9089                                    struct btrfs_root *root,
9090                                    struct btrfs_path *path,
9091                                    struct walk_control *wc)
9092 {
9093         int level = wc->level;
9094         int lookup_info = 1;
9095         int ret;
9096
9097         while (level >= 0) {
9098                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
9099                 if (ret > 0)
9100                         break;
9101
9102                 if (level == 0)
9103                         break;
9104
9105                 if (path->slots[level] >=
9106                     btrfs_header_nritems(path->nodes[level]))
9107                         break;
9108
9109                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
9110                 if (ret > 0) {
9111                         path->slots[level]++;
9112                         continue;
9113                 } else if (ret < 0)
9114                         return ret;
9115                 level = wc->level;
9116         }
9117         return 0;
9118 }
9119
9120 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
9121                                  struct btrfs_root *root,
9122                                  struct btrfs_path *path,
9123                                  struct walk_control *wc, int max_level)
9124 {
9125         int level = wc->level;
9126         int ret;
9127
9128         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
9129         while (level < max_level && path->nodes[level]) {
9130                 wc->level = level;
9131                 if (path->slots[level] + 1 <
9132                     btrfs_header_nritems(path->nodes[level])) {
9133                         path->slots[level]++;
9134                         return 0;
9135                 } else {
9136                         ret = walk_up_proc(trans, root, path, wc);
9137                         if (ret > 0)
9138                                 return 0;
9139
9140                         if (path->locks[level]) {
9141                                 btrfs_tree_unlock_rw(path->nodes[level],
9142                                                      path->locks[level]);
9143                                 path->locks[level] = 0;
9144                         }
9145                         free_extent_buffer(path->nodes[level]);
9146                         path->nodes[level] = NULL;
9147                         level++;
9148                 }
9149         }
9150         return 1;
9151 }
9152
9153 /*
9154  * drop a subvolume tree.
9155  *
9156  * this function traverses the tree freeing any blocks that only
9157  * referenced by the tree.
9158  *
9159  * when a shared tree block is found. this function decreases its
9160  * reference count by one. if update_ref is true, this function
9161  * also make sure backrefs for the shared block and all lower level
9162  * blocks are properly updated.
9163  *
9164  * If called with for_reloc == 0, may exit early with -EAGAIN
9165  */
9166 int btrfs_drop_snapshot(struct btrfs_root *root,
9167                          struct btrfs_block_rsv *block_rsv, int update_ref,
9168                          int for_reloc)
9169 {
9170         struct btrfs_path *path;
9171         struct btrfs_trans_handle *trans;
9172         struct btrfs_root *tree_root = root->fs_info->tree_root;
9173         struct btrfs_root_item *root_item = &root->root_item;
9174         struct walk_control *wc;
9175         struct btrfs_key key;
9176         int err = 0;
9177         int ret;
9178         int level;
9179         bool root_dropped = false;
9180
9181         btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
9182
9183         path = btrfs_alloc_path();
9184         if (!path) {
9185                 err = -ENOMEM;
9186                 goto out;
9187         }
9188
9189         wc = kzalloc(sizeof(*wc), GFP_NOFS);
9190         if (!wc) {
9191                 btrfs_free_path(path);
9192                 err = -ENOMEM;
9193                 goto out;
9194         }
9195
9196         trans = btrfs_start_transaction(tree_root, 0);
9197         if (IS_ERR(trans)) {
9198                 err = PTR_ERR(trans);
9199                 goto out_free;
9200         }
9201
9202         if (block_rsv)
9203                 trans->block_rsv = block_rsv;
9204
9205         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9206                 level = btrfs_header_level(root->node);
9207                 path->nodes[level] = btrfs_lock_root_node(root);
9208                 btrfs_set_lock_blocking(path->nodes[level]);
9209                 path->slots[level] = 0;
9210                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9211                 memset(&wc->update_progress, 0,
9212                        sizeof(wc->update_progress));
9213         } else {
9214                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9215                 memcpy(&wc->update_progress, &key,
9216                        sizeof(wc->update_progress));
9217
9218                 level = root_item->drop_level;
9219                 BUG_ON(level == 0);
9220                 path->lowest_level = level;
9221                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9222                 path->lowest_level = 0;
9223                 if (ret < 0) {
9224                         err = ret;
9225                         goto out_end_trans;
9226                 }
9227                 WARN_ON(ret > 0);
9228
9229                 /*
9230                  * unlock our path, this is safe because only this
9231                  * function is allowed to delete this snapshot
9232                  */
9233                 btrfs_unlock_up_safe(path, 0);
9234
9235                 level = btrfs_header_level(root->node);
9236                 while (1) {
9237                         btrfs_tree_lock(path->nodes[level]);
9238                         btrfs_set_lock_blocking(path->nodes[level]);
9239                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9240
9241                         ret = btrfs_lookup_extent_info(trans, root,
9242                                                 path->nodes[level]->start,
9243                                                 level, 1, &wc->refs[level],
9244                                                 &wc->flags[level]);
9245                         if (ret < 0) {
9246                                 err = ret;
9247                                 goto out_end_trans;
9248                         }
9249                         BUG_ON(wc->refs[level] == 0);
9250
9251                         if (level == root_item->drop_level)
9252                                 break;
9253
9254                         btrfs_tree_unlock(path->nodes[level]);
9255                         path->locks[level] = 0;
9256                         WARN_ON(wc->refs[level] != 1);
9257                         level--;
9258                 }
9259         }
9260
9261         wc->level = level;
9262         wc->shared_level = -1;
9263         wc->stage = DROP_REFERENCE;
9264         wc->update_ref = update_ref;
9265         wc->keep_locks = 0;
9266         wc->for_reloc = for_reloc;
9267         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
9268
9269         while (1) {
9270
9271                 ret = walk_down_tree(trans, root, path, wc);
9272                 if (ret < 0) {
9273                         err = ret;
9274                         break;
9275                 }
9276
9277                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9278                 if (ret < 0) {
9279                         err = ret;
9280                         break;
9281                 }
9282
9283                 if (ret > 0) {
9284                         BUG_ON(wc->stage != DROP_REFERENCE);
9285                         break;
9286                 }
9287
9288                 if (wc->stage == DROP_REFERENCE) {
9289                         level = wc->level;
9290                         btrfs_node_key(path->nodes[level],
9291                                        &root_item->drop_progress,
9292                                        path->slots[level]);
9293                         root_item->drop_level = level;
9294                 }
9295
9296                 BUG_ON(wc->level == 0);
9297                 if (btrfs_should_end_transaction(trans, tree_root) ||
9298                     (!for_reloc && btrfs_need_cleaner_sleep(root))) {
9299                         ret = btrfs_update_root(trans, tree_root,
9300                                                 &root->root_key,
9301                                                 root_item);
9302                         if (ret) {
9303                                 btrfs_abort_transaction(trans, ret);
9304                                 err = ret;
9305                                 goto out_end_trans;
9306                         }
9307
9308                         btrfs_end_transaction_throttle(trans, tree_root);
9309                         if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
9310                                 pr_debug("BTRFS: drop snapshot early exit\n");
9311                                 err = -EAGAIN;
9312                                 goto out_free;
9313                         }
9314
9315                         trans = btrfs_start_transaction(tree_root, 0);
9316                         if (IS_ERR(trans)) {
9317                                 err = PTR_ERR(trans);
9318                                 goto out_free;
9319                         }
9320                         if (block_rsv)
9321                                 trans->block_rsv = block_rsv;
9322                 }
9323         }
9324         btrfs_release_path(path);
9325         if (err)
9326                 goto out_end_trans;
9327
9328         ret = btrfs_del_root(trans, tree_root, &root->root_key);
9329         if (ret) {
9330                 btrfs_abort_transaction(trans, ret);
9331                 goto out_end_trans;
9332         }
9333
9334         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9335                 ret = btrfs_find_root(tree_root, &root->root_key, path,
9336                                       NULL, NULL);
9337                 if (ret < 0) {
9338                         btrfs_abort_transaction(trans, ret);
9339                         err = ret;
9340                         goto out_end_trans;
9341                 } else if (ret > 0) {
9342                         /* if we fail to delete the orphan item this time
9343                          * around, it'll get picked up the next time.
9344                          *
9345                          * The most common failure here is just -ENOENT.
9346                          */
9347                         btrfs_del_orphan_item(trans, tree_root,
9348                                               root->root_key.objectid);
9349                 }
9350         }
9351
9352         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9353                 btrfs_add_dropped_root(trans, root);
9354         } else {
9355                 free_extent_buffer(root->node);
9356                 free_extent_buffer(root->commit_root);
9357                 btrfs_put_fs_root(root);
9358         }
9359         root_dropped = true;
9360 out_end_trans:
9361         btrfs_end_transaction_throttle(trans, tree_root);
9362 out_free:
9363         kfree(wc);
9364         btrfs_free_path(path);
9365 out:
9366         /*
9367          * So if we need to stop dropping the snapshot for whatever reason we
9368          * need to make sure to add it back to the dead root list so that we
9369          * keep trying to do the work later.  This also cleans up roots if we
9370          * don't have it in the radix (like when we recover after a power fail
9371          * or unmount) so we don't leak memory.
9372          */
9373         if (!for_reloc && root_dropped == false)
9374                 btrfs_add_dead_root(root);
9375         if (err && err != -EAGAIN)
9376                 btrfs_handle_fs_error(root->fs_info, err, NULL);
9377         return err;
9378 }
9379
9380 /*
9381  * drop subtree rooted at tree block 'node'.
9382  *
9383  * NOTE: this function will unlock and release tree block 'node'
9384  * only used by relocation code
9385  */
9386 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9387                         struct btrfs_root *root,
9388                         struct extent_buffer *node,
9389                         struct extent_buffer *parent)
9390 {
9391         struct btrfs_path *path;
9392         struct walk_control *wc;
9393         int level;
9394         int parent_level;
9395         int ret = 0;
9396         int wret;
9397
9398         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9399
9400         path = btrfs_alloc_path();
9401         if (!path)
9402                 return -ENOMEM;
9403
9404         wc = kzalloc(sizeof(*wc), GFP_NOFS);
9405         if (!wc) {
9406                 btrfs_free_path(path);
9407                 return -ENOMEM;
9408         }
9409
9410         btrfs_assert_tree_locked(parent);
9411         parent_level = btrfs_header_level(parent);
9412         extent_buffer_get(parent);
9413         path->nodes[parent_level] = parent;
9414         path->slots[parent_level] = btrfs_header_nritems(parent);
9415
9416         btrfs_assert_tree_locked(node);
9417         level = btrfs_header_level(node);
9418         path->nodes[level] = node;
9419         path->slots[level] = 0;
9420         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9421
9422         wc->refs[parent_level] = 1;
9423         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9424         wc->level = level;
9425         wc->shared_level = -1;
9426         wc->stage = DROP_REFERENCE;
9427         wc->update_ref = 0;
9428         wc->keep_locks = 1;
9429         wc->for_reloc = 1;
9430         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
9431
9432         while (1) {
9433                 wret = walk_down_tree(trans, root, path, wc);
9434                 if (wret < 0) {
9435                         ret = wret;
9436                         break;
9437                 }
9438
9439                 wret = walk_up_tree(trans, root, path, wc, parent_level);
9440                 if (wret < 0)
9441                         ret = wret;
9442                 if (wret != 0)
9443                         break;
9444         }
9445
9446         kfree(wc);
9447         btrfs_free_path(path);
9448         return ret;
9449 }
9450
9451 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
9452 {
9453         u64 num_devices;
9454         u64 stripped;
9455
9456         /*
9457          * if restripe for this chunk_type is on pick target profile and
9458          * return, otherwise do the usual balance
9459          */
9460         stripped = get_restripe_target(root->fs_info, flags);
9461         if (stripped)
9462                 return extended_to_chunk(stripped);
9463
9464         num_devices = root->fs_info->fs_devices->rw_devices;
9465
9466         stripped = BTRFS_BLOCK_GROUP_RAID0 |
9467                 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9468                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9469
9470         if (num_devices == 1) {
9471                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9472                 stripped = flags & ~stripped;
9473
9474                 /* turn raid0 into single device chunks */
9475                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
9476                         return stripped;
9477
9478                 /* turn mirroring into duplication */
9479                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9480                              BTRFS_BLOCK_GROUP_RAID10))
9481                         return stripped | BTRFS_BLOCK_GROUP_DUP;
9482         } else {
9483                 /* they already had raid on here, just return */
9484                 if (flags & stripped)
9485                         return flags;
9486
9487                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9488                 stripped = flags & ~stripped;
9489
9490                 /* switch duplicated blocks with raid1 */
9491                 if (flags & BTRFS_BLOCK_GROUP_DUP)
9492                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
9493
9494                 /* this is drive concat, leave it alone */
9495         }
9496
9497         return flags;
9498 }
9499
9500 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9501 {
9502         struct btrfs_space_info *sinfo = cache->space_info;
9503         u64 num_bytes;
9504         u64 min_allocable_bytes;
9505         int ret = -ENOSPC;
9506
9507         /*
9508          * We need some metadata space and system metadata space for
9509          * allocating chunks in some corner cases until we force to set
9510          * it to be readonly.
9511          */
9512         if ((sinfo->flags &
9513              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9514             !force)
9515                 min_allocable_bytes = SZ_1M;
9516         else
9517                 min_allocable_bytes = 0;
9518
9519         spin_lock(&sinfo->lock);
9520         spin_lock(&cache->lock);
9521
9522         if (cache->ro) {
9523                 cache->ro++;
9524                 ret = 0;
9525                 goto out;
9526         }
9527
9528         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9529                     cache->bytes_super - btrfs_block_group_used(&cache->item);
9530
9531         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
9532             sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
9533             min_allocable_bytes <= sinfo->total_bytes) {
9534                 sinfo->bytes_readonly += num_bytes;
9535                 cache->ro++;
9536                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9537                 ret = 0;
9538         }
9539 out:
9540         spin_unlock(&cache->lock);
9541         spin_unlock(&sinfo->lock);
9542         return ret;
9543 }
9544
9545 int btrfs_inc_block_group_ro(struct btrfs_root *root,
9546                              struct btrfs_block_group_cache *cache)
9547
9548 {
9549         struct btrfs_trans_handle *trans;
9550         u64 alloc_flags;
9551         int ret;
9552
9553 again:
9554         trans = btrfs_join_transaction(root);
9555         if (IS_ERR(trans))
9556                 return PTR_ERR(trans);
9557
9558         /*
9559          * we're not allowed to set block groups readonly after the dirty
9560          * block groups cache has started writing.  If it already started,
9561          * back off and let this transaction commit
9562          */
9563         mutex_lock(&root->fs_info->ro_block_group_mutex);
9564         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9565                 u64 transid = trans->transid;
9566
9567                 mutex_unlock(&root->fs_info->ro_block_group_mutex);
9568                 btrfs_end_transaction(trans, root);
9569
9570                 ret = btrfs_wait_for_commit(root, transid);
9571                 if (ret)
9572                         return ret;
9573                 goto again;
9574         }
9575
9576         /*
9577          * if we are changing raid levels, try to allocate a corresponding
9578          * block group with the new raid level.
9579          */
9580         alloc_flags = update_block_group_flags(root, cache->flags);
9581         if (alloc_flags != cache->flags) {
9582                 ret = do_chunk_alloc(trans, root, alloc_flags,
9583                                      CHUNK_ALLOC_FORCE);
9584                 /*
9585                  * ENOSPC is allowed here, we may have enough space
9586                  * already allocated at the new raid level to
9587                  * carry on
9588                  */
9589                 if (ret == -ENOSPC)
9590                         ret = 0;
9591                 if (ret < 0)
9592                         goto out;
9593         }
9594
9595         ret = inc_block_group_ro(cache, 0);
9596         if (!ret)
9597                 goto out;
9598         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
9599         ret = do_chunk_alloc(trans, root, alloc_flags,
9600                              CHUNK_ALLOC_FORCE);
9601         if (ret < 0)
9602                 goto out;
9603         ret = inc_block_group_ro(cache, 0);
9604 out:
9605         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9606                 alloc_flags = update_block_group_flags(root, cache->flags);
9607                 lock_chunks(root->fs_info->chunk_root);
9608                 check_system_chunk(trans, root, alloc_flags);
9609                 unlock_chunks(root->fs_info->chunk_root);
9610         }
9611         mutex_unlock(&root->fs_info->ro_block_group_mutex);
9612
9613         btrfs_end_transaction(trans, root);
9614         return ret;
9615 }
9616
9617 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
9618                             struct btrfs_root *root, u64 type)
9619 {
9620         u64 alloc_flags = get_alloc_profile(root, type);
9621         return do_chunk_alloc(trans, root, alloc_flags,
9622                               CHUNK_ALLOC_FORCE);
9623 }
9624
9625 /*
9626  * helper to account the unused space of all the readonly block group in the
9627  * space_info. takes mirrors into account.
9628  */
9629 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9630 {
9631         struct btrfs_block_group_cache *block_group;
9632         u64 free_bytes = 0;
9633         int factor;
9634
9635         /* It's df, we don't care if it's racy */
9636         if (list_empty(&sinfo->ro_bgs))
9637                 return 0;
9638
9639         spin_lock(&sinfo->lock);
9640         list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9641                 spin_lock(&block_group->lock);
9642
9643                 if (!block_group->ro) {
9644                         spin_unlock(&block_group->lock);
9645                         continue;
9646                 }
9647
9648                 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
9649                                           BTRFS_BLOCK_GROUP_RAID10 |
9650                                           BTRFS_BLOCK_GROUP_DUP))
9651                         factor = 2;
9652                 else
9653                         factor = 1;
9654
9655                 free_bytes += (block_group->key.offset -
9656                                btrfs_block_group_used(&block_group->item)) *
9657                                factor;
9658
9659                 spin_unlock(&block_group->lock);
9660         }
9661         spin_unlock(&sinfo->lock);
9662
9663         return free_bytes;
9664 }
9665
9666 void btrfs_dec_block_group_ro(struct btrfs_root *root,
9667                               struct btrfs_block_group_cache *cache)
9668 {
9669         struct btrfs_space_info *sinfo = cache->space_info;
9670         u64 num_bytes;
9671
9672         BUG_ON(!cache->ro);
9673
9674         spin_lock(&sinfo->lock);
9675         spin_lock(&cache->lock);
9676         if (!--cache->ro) {
9677                 num_bytes = cache->key.offset - cache->reserved -
9678                             cache->pinned - cache->bytes_super -
9679                             btrfs_block_group_used(&cache->item);
9680                 sinfo->bytes_readonly -= num_bytes;
9681                 list_del_init(&cache->ro_list);
9682         }
9683         spin_unlock(&cache->lock);
9684         spin_unlock(&sinfo->lock);
9685 }
9686
9687 /*
9688  * checks to see if its even possible to relocate this block group.
9689  *
9690  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
9691  * ok to go ahead and try.
9692  */
9693 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
9694 {
9695         struct btrfs_block_group_cache *block_group;
9696         struct btrfs_space_info *space_info;
9697         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
9698         struct btrfs_device *device;
9699         struct btrfs_trans_handle *trans;
9700         u64 min_free;
9701         u64 dev_min = 1;
9702         u64 dev_nr = 0;
9703         u64 target;
9704         int debug;
9705         int index;
9706         int full = 0;
9707         int ret = 0;
9708
9709         debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG);
9710
9711         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
9712
9713         /* odd, couldn't find the block group, leave it alone */
9714         if (!block_group) {
9715                 if (debug)
9716                         btrfs_warn(root->fs_info,
9717                                    "can't find block group for bytenr %llu",
9718                                    bytenr);
9719                 return -1;
9720         }
9721
9722         min_free = btrfs_block_group_used(&block_group->item);
9723
9724         /* no bytes used, we're good */
9725         if (!min_free)
9726                 goto out;
9727
9728         space_info = block_group->space_info;
9729         spin_lock(&space_info->lock);
9730
9731         full = space_info->full;
9732
9733         /*
9734          * if this is the last block group we have in this space, we can't
9735          * relocate it unless we're able to allocate a new chunk below.
9736          *
9737          * Otherwise, we need to make sure we have room in the space to handle
9738          * all of the extents from this block group.  If we can, we're good
9739          */
9740         if ((space_info->total_bytes != block_group->key.offset) &&
9741             (space_info->bytes_used + space_info->bytes_reserved +
9742              space_info->bytes_pinned + space_info->bytes_readonly +
9743              min_free < space_info->total_bytes)) {
9744                 spin_unlock(&space_info->lock);
9745                 goto out;
9746         }
9747         spin_unlock(&space_info->lock);
9748
9749         /*
9750          * ok we don't have enough space, but maybe we have free space on our
9751          * devices to allocate new chunks for relocation, so loop through our
9752          * alloc devices and guess if we have enough space.  if this block
9753          * group is going to be restriped, run checks against the target
9754          * profile instead of the current one.
9755          */
9756         ret = -1;
9757
9758         /*
9759          * index:
9760          *      0: raid10
9761          *      1: raid1
9762          *      2: dup
9763          *      3: raid0
9764          *      4: single
9765          */
9766         target = get_restripe_target(root->fs_info, block_group->flags);
9767         if (target) {
9768                 index = __get_raid_index(extended_to_chunk(target));
9769         } else {
9770                 /*
9771                  * this is just a balance, so if we were marked as full
9772                  * we know there is no space for a new chunk
9773                  */
9774                 if (full) {
9775                         if (debug)
9776                                 btrfs_warn(root->fs_info,
9777                                         "no space to alloc new chunk for block group %llu",
9778                                         block_group->key.objectid);
9779                         goto out;
9780                 }
9781
9782                 index = get_block_group_index(block_group);
9783         }
9784
9785         if (index == BTRFS_RAID_RAID10) {
9786                 dev_min = 4;
9787                 /* Divide by 2 */
9788                 min_free >>= 1;
9789         } else if (index == BTRFS_RAID_RAID1) {
9790                 dev_min = 2;
9791         } else if (index == BTRFS_RAID_DUP) {
9792                 /* Multiply by 2 */
9793                 min_free <<= 1;
9794         } else if (index == BTRFS_RAID_RAID0) {
9795                 dev_min = fs_devices->rw_devices;
9796                 min_free = div64_u64(min_free, dev_min);
9797         }
9798
9799         /* We need to do this so that we can look at pending chunks */
9800         trans = btrfs_join_transaction(root);
9801         if (IS_ERR(trans)) {
9802                 ret = PTR_ERR(trans);
9803                 goto out;
9804         }
9805
9806         mutex_lock(&root->fs_info->chunk_mutex);
9807         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9808                 u64 dev_offset;
9809
9810                 /*
9811                  * check to make sure we can actually find a chunk with enough
9812                  * space to fit our block group in.
9813                  */
9814                 if (device->total_bytes > device->bytes_used + min_free &&
9815                     !device->is_tgtdev_for_dev_replace) {
9816                         ret = find_free_dev_extent(trans, device, min_free,
9817                                                    &dev_offset, NULL);
9818                         if (!ret)
9819                                 dev_nr++;
9820
9821                         if (dev_nr >= dev_min)
9822                                 break;
9823
9824                         ret = -1;
9825                 }
9826         }
9827         if (debug && ret == -1)
9828                 btrfs_warn(root->fs_info,
9829                         "no space to allocate a new chunk for block group %llu",
9830                         block_group->key.objectid);
9831         mutex_unlock(&root->fs_info->chunk_mutex);
9832         btrfs_end_transaction(trans, root);
9833 out:
9834         btrfs_put_block_group(block_group);
9835         return ret;
9836 }
9837
9838 static int find_first_block_group(struct btrfs_root *root,
9839                 struct btrfs_path *path, struct btrfs_key *key)
9840 {
9841         int ret = 0;
9842         struct btrfs_key found_key;
9843         struct extent_buffer *leaf;
9844         int slot;
9845
9846         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9847         if (ret < 0)
9848                 goto out;
9849
9850         while (1) {
9851                 slot = path->slots[0];
9852                 leaf = path->nodes[0];
9853                 if (slot >= btrfs_header_nritems(leaf)) {
9854                         ret = btrfs_next_leaf(root, path);
9855                         if (ret == 0)
9856                                 continue;
9857                         if (ret < 0)
9858                                 goto out;
9859                         break;
9860                 }
9861                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9862
9863                 if (found_key.objectid >= key->objectid &&
9864                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9865                         struct extent_map_tree *em_tree;
9866                         struct extent_map *em;
9867
9868                         em_tree = &root->fs_info->mapping_tree.map_tree;
9869                         read_lock(&em_tree->lock);
9870                         em = lookup_extent_mapping(em_tree, found_key.objectid,
9871                                                    found_key.offset);
9872                         read_unlock(&em_tree->lock);
9873                         if (!em) {
9874                                 btrfs_err(root->fs_info,
9875                         "logical %llu len %llu found bg but no related chunk",
9876                                           found_key.objectid, found_key.offset);
9877                                 ret = -ENOENT;
9878                         } else {
9879                                 ret = 0;
9880                         }
9881                         goto out;
9882                 }
9883                 path->slots[0]++;
9884         }
9885 out:
9886         return ret;
9887 }
9888
9889 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9890 {
9891         struct btrfs_block_group_cache *block_group;
9892         u64 last = 0;
9893
9894         while (1) {
9895                 struct inode *inode;
9896
9897                 block_group = btrfs_lookup_first_block_group(info, last);
9898                 while (block_group) {
9899                         spin_lock(&block_group->lock);
9900                         if (block_group->iref)
9901                                 break;
9902                         spin_unlock(&block_group->lock);
9903                         block_group = next_block_group(info->tree_root,
9904                                                        block_group);
9905                 }
9906                 if (!block_group) {
9907                         if (last == 0)
9908                                 break;
9909                         last = 0;
9910                         continue;
9911                 }
9912
9913                 inode = block_group->inode;
9914                 block_group->iref = 0;
9915                 block_group->inode = NULL;
9916                 spin_unlock(&block_group->lock);
9917                 ASSERT(block_group->io_ctl.inode == NULL);
9918                 iput(inode);
9919                 last = block_group->key.objectid + block_group->key.offset;
9920                 btrfs_put_block_group(block_group);
9921         }
9922 }
9923
9924 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9925 {
9926         struct btrfs_block_group_cache *block_group;
9927         struct btrfs_space_info *space_info;
9928         struct btrfs_caching_control *caching_ctl;
9929         struct rb_node *n;
9930
9931         down_write(&info->commit_root_sem);
9932         while (!list_empty(&info->caching_block_groups)) {
9933                 caching_ctl = list_entry(info->caching_block_groups.next,
9934                                          struct btrfs_caching_control, list);
9935                 list_del(&caching_ctl->list);
9936                 put_caching_control(caching_ctl);
9937         }
9938         up_write(&info->commit_root_sem);
9939
9940         spin_lock(&info->unused_bgs_lock);
9941         while (!list_empty(&info->unused_bgs)) {
9942                 block_group = list_first_entry(&info->unused_bgs,
9943                                                struct btrfs_block_group_cache,
9944                                                bg_list);
9945                 list_del_init(&block_group->bg_list);
9946                 btrfs_put_block_group(block_group);
9947         }
9948         spin_unlock(&info->unused_bgs_lock);
9949
9950         spin_lock(&info->block_group_cache_lock);
9951         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9952                 block_group = rb_entry(n, struct btrfs_block_group_cache,
9953                                        cache_node);
9954                 rb_erase(&block_group->cache_node,
9955                          &info->block_group_cache_tree);
9956                 RB_CLEAR_NODE(&block_group->cache_node);
9957                 spin_unlock(&info->block_group_cache_lock);
9958
9959                 down_write(&block_group->space_info->groups_sem);
9960                 list_del(&block_group->list);
9961                 up_write(&block_group->space_info->groups_sem);
9962
9963                 if (block_group->cached == BTRFS_CACHE_STARTED)
9964                         wait_block_group_cache_done(block_group);
9965
9966                 /*
9967                  * We haven't cached this block group, which means we could
9968                  * possibly have excluded extents on this block group.
9969                  */
9970                 if (block_group->cached == BTRFS_CACHE_NO ||
9971                     block_group->cached == BTRFS_CACHE_ERROR)
9972                         free_excluded_extents(info->extent_root, block_group);
9973
9974                 btrfs_remove_free_space_cache(block_group);
9975                 ASSERT(list_empty(&block_group->dirty_list));
9976                 ASSERT(list_empty(&block_group->io_list));
9977                 ASSERT(list_empty(&block_group->bg_list));
9978                 ASSERT(atomic_read(&block_group->count) == 1);
9979                 btrfs_put_block_group(block_group);
9980
9981                 spin_lock(&info->block_group_cache_lock);
9982         }
9983         spin_unlock(&info->block_group_cache_lock);
9984
9985         /* now that all the block groups are freed, go through and
9986          * free all the space_info structs.  This is only called during
9987          * the final stages of unmount, and so we know nobody is
9988          * using them.  We call synchronize_rcu() once before we start,
9989          * just to be on the safe side.
9990          */
9991         synchronize_rcu();
9992
9993         release_global_block_rsv(info);
9994
9995         while (!list_empty(&info->space_info)) {
9996                 int i;
9997
9998                 space_info = list_entry(info->space_info.next,
9999                                         struct btrfs_space_info,
10000                                         list);
10001
10002                 /*
10003                  * Do not hide this behind enospc_debug, this is actually
10004                  * important and indicates a real bug if this happens.
10005                  */
10006                 if (WARN_ON(space_info->bytes_pinned > 0 ||
10007                             space_info->bytes_reserved > 0 ||
10008                             space_info->bytes_may_use > 0))
10009                         dump_space_info(space_info, 0, 0);
10010                 list_del(&space_info->list);
10011                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
10012                         struct kobject *kobj;
10013                         kobj = space_info->block_group_kobjs[i];
10014                         space_info->block_group_kobjs[i] = NULL;
10015                         if (kobj) {
10016                                 kobject_del(kobj);
10017                                 kobject_put(kobj);
10018                         }
10019                 }
10020                 kobject_del(&space_info->kobj);
10021                 kobject_put(&space_info->kobj);
10022         }
10023         return 0;
10024 }
10025
10026 static void __link_block_group(struct btrfs_space_info *space_info,
10027                                struct btrfs_block_group_cache *cache)
10028 {
10029         int index = get_block_group_index(cache);
10030         bool first = false;
10031
10032         down_write(&space_info->groups_sem);
10033         if (list_empty(&space_info->block_groups[index]))
10034                 first = true;
10035         list_add_tail(&cache->list, &space_info->block_groups[index]);
10036         up_write(&space_info->groups_sem);
10037
10038         if (first) {
10039                 struct raid_kobject *rkobj;
10040                 int ret;
10041
10042                 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
10043                 if (!rkobj)
10044                         goto out_err;
10045                 rkobj->raid_type = index;
10046                 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
10047                 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
10048                                   "%s", get_raid_name(index));
10049                 if (ret) {
10050                         kobject_put(&rkobj->kobj);
10051                         goto out_err;
10052                 }
10053                 space_info->block_group_kobjs[index] = &rkobj->kobj;
10054         }
10055
10056         return;
10057 out_err:
10058         pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
10059 }
10060
10061 static struct btrfs_block_group_cache *
10062 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
10063 {
10064         struct btrfs_block_group_cache *cache;
10065
10066         cache = kzalloc(sizeof(*cache), GFP_NOFS);
10067         if (!cache)
10068                 return NULL;
10069
10070         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
10071                                         GFP_NOFS);
10072         if (!cache->free_space_ctl) {
10073                 kfree(cache);
10074                 return NULL;
10075         }
10076
10077         cache->key.objectid = start;
10078         cache->key.offset = size;
10079         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10080
10081         cache->sectorsize = root->sectorsize;
10082         cache->fs_info = root->fs_info;
10083         cache->full_stripe_len = btrfs_full_stripe_len(root,
10084                                                &root->fs_info->mapping_tree,
10085                                                start);
10086         set_free_space_tree_thresholds(cache);
10087
10088         atomic_set(&cache->count, 1);
10089         spin_lock_init(&cache->lock);
10090         init_rwsem(&cache->data_rwsem);
10091         INIT_LIST_HEAD(&cache->list);
10092         INIT_LIST_HEAD(&cache->cluster_list);
10093         INIT_LIST_HEAD(&cache->bg_list);
10094         INIT_LIST_HEAD(&cache->ro_list);
10095         INIT_LIST_HEAD(&cache->dirty_list);
10096         INIT_LIST_HEAD(&cache->io_list);
10097         btrfs_init_free_space_ctl(cache);
10098         atomic_set(&cache->trimming, 0);
10099         mutex_init(&cache->free_space_lock);
10100
10101         return cache;
10102 }
10103
10104 int btrfs_read_block_groups(struct btrfs_root *root)
10105 {
10106         struct btrfs_path *path;
10107         int ret;
10108         struct btrfs_block_group_cache *cache;
10109         struct btrfs_fs_info *info = root->fs_info;
10110         struct btrfs_space_info *space_info;
10111         struct btrfs_key key;
10112         struct btrfs_key found_key;
10113         struct extent_buffer *leaf;
10114         int need_clear = 0;
10115         u64 cache_gen;
10116
10117         root = info->extent_root;
10118         key.objectid = 0;
10119         key.offset = 0;
10120         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10121         path = btrfs_alloc_path();
10122         if (!path)
10123                 return -ENOMEM;
10124         path->reada = READA_FORWARD;
10125
10126         cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
10127         if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
10128             btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
10129                 need_clear = 1;
10130         if (btrfs_test_opt(root->fs_info, CLEAR_CACHE))
10131                 need_clear = 1;
10132
10133         while (1) {
10134                 ret = find_first_block_group(root, path, &key);
10135                 if (ret > 0)
10136                         break;
10137                 if (ret != 0)
10138                         goto error;
10139
10140                 leaf = path->nodes[0];
10141                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10142
10143                 cache = btrfs_create_block_group_cache(root, found_key.objectid,
10144                                                        found_key.offset);
10145                 if (!cache) {
10146                         ret = -ENOMEM;
10147                         goto error;
10148                 }
10149
10150                 if (need_clear) {
10151                         /*
10152                          * When we mount with old space cache, we need to
10153                          * set BTRFS_DC_CLEAR and set dirty flag.
10154                          *
10155                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10156                          *    truncate the old free space cache inode and
10157                          *    setup a new one.
10158                          * b) Setting 'dirty flag' makes sure that we flush
10159                          *    the new space cache info onto disk.
10160                          */
10161                         if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
10162                                 cache->disk_cache_state = BTRFS_DC_CLEAR;
10163                 }
10164
10165                 read_extent_buffer(leaf, &cache->item,
10166                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
10167                                    sizeof(cache->item));
10168                 cache->flags = btrfs_block_group_flags(&cache->item);
10169
10170                 key.objectid = found_key.objectid + found_key.offset;
10171                 btrfs_release_path(path);
10172
10173                 /*
10174                  * We need to exclude the super stripes now so that the space
10175                  * info has super bytes accounted for, otherwise we'll think
10176                  * we have more space than we actually do.
10177                  */
10178                 ret = exclude_super_stripes(root, cache);
10179                 if (ret) {
10180                         /*
10181                          * We may have excluded something, so call this just in
10182                          * case.
10183                          */
10184                         free_excluded_extents(root, cache);
10185                         btrfs_put_block_group(cache);
10186                         goto error;
10187                 }
10188
10189                 /*
10190                  * check for two cases, either we are full, and therefore
10191                  * don't need to bother with the caching work since we won't
10192                  * find any space, or we are empty, and we can just add all
10193                  * the space in and be done with it.  This saves us _alot_ of
10194                  * time, particularly in the full case.
10195                  */
10196                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10197                         cache->last_byte_to_unpin = (u64)-1;
10198                         cache->cached = BTRFS_CACHE_FINISHED;
10199                         free_excluded_extents(root, cache);
10200                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10201                         cache->last_byte_to_unpin = (u64)-1;
10202                         cache->cached = BTRFS_CACHE_FINISHED;
10203                         add_new_free_space(cache, root->fs_info,
10204                                            found_key.objectid,
10205                                            found_key.objectid +
10206                                            found_key.offset);
10207                         free_excluded_extents(root, cache);
10208                 }
10209
10210                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
10211                 if (ret) {
10212                         btrfs_remove_free_space_cache(cache);
10213                         btrfs_put_block_group(cache);
10214                         goto error;
10215                 }
10216
10217                 trace_btrfs_add_block_group(root->fs_info, cache, 0);
10218                 ret = update_space_info(info, cache->flags, found_key.offset,
10219                                         btrfs_block_group_used(&cache->item),
10220                                         cache->bytes_super, &space_info);
10221                 if (ret) {
10222                         btrfs_remove_free_space_cache(cache);
10223                         spin_lock(&info->block_group_cache_lock);
10224                         rb_erase(&cache->cache_node,
10225                                  &info->block_group_cache_tree);
10226                         RB_CLEAR_NODE(&cache->cache_node);
10227                         spin_unlock(&info->block_group_cache_lock);
10228                         btrfs_put_block_group(cache);
10229                         goto error;
10230                 }
10231
10232                 cache->space_info = space_info;
10233
10234                 __link_block_group(space_info, cache);
10235
10236                 set_avail_alloc_bits(root->fs_info, cache->flags);
10237                 if (btrfs_chunk_readonly(root, cache->key.objectid)) {
10238                         inc_block_group_ro(cache, 1);
10239                 } else if (btrfs_block_group_used(&cache->item) == 0) {
10240                         spin_lock(&info->unused_bgs_lock);
10241                         /* Should always be true but just in case. */
10242                         if (list_empty(&cache->bg_list)) {
10243                                 btrfs_get_block_group(cache);
10244                                 list_add_tail(&cache->bg_list,
10245                                               &info->unused_bgs);
10246                         }
10247                         spin_unlock(&info->unused_bgs_lock);
10248                 }
10249         }
10250
10251         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
10252                 if (!(get_alloc_profile(root, space_info->flags) &
10253                       (BTRFS_BLOCK_GROUP_RAID10 |
10254                        BTRFS_BLOCK_GROUP_RAID1 |
10255                        BTRFS_BLOCK_GROUP_RAID5 |
10256                        BTRFS_BLOCK_GROUP_RAID6 |
10257                        BTRFS_BLOCK_GROUP_DUP)))
10258                         continue;
10259                 /*
10260                  * avoid allocating from un-mirrored block group if there are
10261                  * mirrored block groups.
10262                  */
10263                 list_for_each_entry(cache,
10264                                 &space_info->block_groups[BTRFS_RAID_RAID0],
10265                                 list)
10266                         inc_block_group_ro(cache, 1);
10267                 list_for_each_entry(cache,
10268                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
10269                                 list)
10270                         inc_block_group_ro(cache, 1);
10271         }
10272
10273         init_global_block_rsv(info);
10274         ret = 0;
10275 error:
10276         btrfs_free_path(path);
10277         return ret;
10278 }
10279
10280 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
10281                                        struct btrfs_root *root)
10282 {
10283         struct btrfs_block_group_cache *block_group, *tmp;
10284         struct btrfs_root *extent_root = root->fs_info->extent_root;
10285         struct btrfs_block_group_item item;
10286         struct btrfs_key key;
10287         int ret = 0;
10288         bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
10289
10290         trans->can_flush_pending_bgs = false;
10291         list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
10292                 if (ret)
10293                         goto next;
10294
10295                 spin_lock(&block_group->lock);
10296                 memcpy(&item, &block_group->item, sizeof(item));
10297                 memcpy(&key, &block_group->key, sizeof(key));
10298                 spin_unlock(&block_group->lock);
10299
10300                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
10301                                         sizeof(item));
10302                 if (ret)
10303                         btrfs_abort_transaction(trans, ret);
10304                 ret = btrfs_finish_chunk_alloc(trans, extent_root,
10305                                                key.objectid, key.offset);
10306                 if (ret)
10307                         btrfs_abort_transaction(trans, ret);
10308                 add_block_group_free_space(trans, root->fs_info, block_group);
10309                 /* already aborted the transaction if it failed. */
10310 next:
10311                 list_del_init(&block_group->bg_list);
10312         }
10313         trans->can_flush_pending_bgs = can_flush_pending_bgs;
10314 }
10315
10316 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
10317                            struct btrfs_root *root, u64 bytes_used,
10318                            u64 type, u64 chunk_objectid, u64 chunk_offset,
10319                            u64 size)
10320 {
10321         int ret;
10322         struct btrfs_root *extent_root;
10323         struct btrfs_block_group_cache *cache;
10324         extent_root = root->fs_info->extent_root;
10325
10326         btrfs_set_log_full_commit(root->fs_info, trans);
10327
10328         cache = btrfs_create_block_group_cache(root, chunk_offset, size);
10329         if (!cache)
10330                 return -ENOMEM;
10331
10332         btrfs_set_block_group_used(&cache->item, bytes_used);
10333         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
10334         btrfs_set_block_group_flags(&cache->item, type);
10335
10336         cache->flags = type;
10337         cache->last_byte_to_unpin = (u64)-1;
10338         cache->cached = BTRFS_CACHE_FINISHED;
10339         cache->needs_free_space = 1;
10340         ret = exclude_super_stripes(root, cache);
10341         if (ret) {
10342                 /*
10343                  * We may have excluded something, so call this just in
10344                  * case.
10345                  */
10346                 free_excluded_extents(root, cache);
10347                 btrfs_put_block_group(cache);
10348                 return ret;
10349         }
10350
10351         add_new_free_space(cache, root->fs_info, chunk_offset,
10352                            chunk_offset + size);
10353
10354         free_excluded_extents(root, cache);
10355
10356 #ifdef CONFIG_BTRFS_DEBUG
10357         if (btrfs_should_fragment_free_space(root, cache)) {
10358                 u64 new_bytes_used = size - bytes_used;
10359
10360                 bytes_used += new_bytes_used >> 1;
10361                 fragment_free_space(root, cache);
10362         }
10363 #endif
10364         /*
10365          * Call to ensure the corresponding space_info object is created and
10366          * assigned to our block group, but don't update its counters just yet.
10367          * We want our bg to be added to the rbtree with its ->space_info set.
10368          */
10369         ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0,
10370                                 &cache->space_info);
10371         if (ret) {
10372                 btrfs_remove_free_space_cache(cache);
10373                 btrfs_put_block_group(cache);
10374                 return ret;
10375         }
10376
10377         ret = btrfs_add_block_group_cache(root->fs_info, cache);
10378         if (ret) {
10379                 btrfs_remove_free_space_cache(cache);
10380                 btrfs_put_block_group(cache);
10381                 return ret;
10382         }
10383
10384         /*
10385          * Now that our block group has its ->space_info set and is inserted in
10386          * the rbtree, update the space info's counters.
10387          */
10388         trace_btrfs_add_block_group(root->fs_info, cache, 1);
10389         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
10390                                 cache->bytes_super, &cache->space_info);
10391         if (ret) {
10392                 btrfs_remove_free_space_cache(cache);
10393                 spin_lock(&root->fs_info->block_group_cache_lock);
10394                 rb_erase(&cache->cache_node,
10395                          &root->fs_info->block_group_cache_tree);
10396                 RB_CLEAR_NODE(&cache->cache_node);
10397                 spin_unlock(&root->fs_info->block_group_cache_lock);
10398                 btrfs_put_block_group(cache);
10399                 return ret;
10400         }
10401         update_global_block_rsv(root->fs_info);
10402
10403         __link_block_group(cache->space_info, cache);
10404
10405         list_add_tail(&cache->bg_list, &trans->new_bgs);
10406
10407         set_avail_alloc_bits(extent_root->fs_info, type);
10408         return 0;
10409 }
10410
10411 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10412 {
10413         u64 extra_flags = chunk_to_extended(flags) &
10414                                 BTRFS_EXTENDED_PROFILE_MASK;
10415
10416         write_seqlock(&fs_info->profiles_lock);
10417         if (flags & BTRFS_BLOCK_GROUP_DATA)
10418                 fs_info->avail_data_alloc_bits &= ~extra_flags;
10419         if (flags & BTRFS_BLOCK_GROUP_METADATA)
10420                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10421         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10422                 fs_info->avail_system_alloc_bits &= ~extra_flags;
10423         write_sequnlock(&fs_info->profiles_lock);
10424 }
10425
10426 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10427                              struct btrfs_root *root, u64 group_start,
10428                              struct extent_map *em)
10429 {
10430         struct btrfs_path *path;
10431         struct btrfs_block_group_cache *block_group;
10432         struct btrfs_free_cluster *cluster;
10433         struct btrfs_root *tree_root = root->fs_info->tree_root;
10434         struct btrfs_key key;
10435         struct inode *inode;
10436         struct kobject *kobj = NULL;
10437         int ret;
10438         int index;
10439         int factor;
10440         struct btrfs_caching_control *caching_ctl = NULL;
10441         bool remove_em;
10442
10443         root = root->fs_info->extent_root;
10444
10445         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
10446         BUG_ON(!block_group);
10447         BUG_ON(!block_group->ro);
10448
10449         /*
10450          * Free the reserved super bytes from this block group before
10451          * remove it.
10452          */
10453         free_excluded_extents(root, block_group);
10454
10455         memcpy(&key, &block_group->key, sizeof(key));
10456         index = get_block_group_index(block_group);
10457         if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
10458                                   BTRFS_BLOCK_GROUP_RAID1 |
10459                                   BTRFS_BLOCK_GROUP_RAID10))
10460                 factor = 2;
10461         else
10462                 factor = 1;
10463
10464         /* make sure this block group isn't part of an allocation cluster */
10465         cluster = &root->fs_info->data_alloc_cluster;
10466         spin_lock(&cluster->refill_lock);
10467         btrfs_return_cluster_to_free_space(block_group, cluster);
10468         spin_unlock(&cluster->refill_lock);
10469
10470         /*
10471          * make sure this block group isn't part of a metadata
10472          * allocation cluster
10473          */
10474         cluster = &root->fs_info->meta_alloc_cluster;
10475         spin_lock(&cluster->refill_lock);
10476         btrfs_return_cluster_to_free_space(block_group, cluster);
10477         spin_unlock(&cluster->refill_lock);
10478
10479         path = btrfs_alloc_path();
10480         if (!path) {
10481                 ret = -ENOMEM;
10482                 goto out;
10483         }
10484
10485         /*
10486          * get the inode first so any iput calls done for the io_list
10487          * aren't the final iput (no unlinks allowed now)
10488          */
10489         inode = lookup_free_space_inode(tree_root, block_group, path);
10490
10491         mutex_lock(&trans->transaction->cache_write_mutex);
10492         /*
10493          * make sure our free spache cache IO is done before remove the
10494          * free space inode
10495          */
10496         spin_lock(&trans->transaction->dirty_bgs_lock);
10497         if (!list_empty(&block_group->io_list)) {
10498                 list_del_init(&block_group->io_list);
10499
10500                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10501
10502                 spin_unlock(&trans->transaction->dirty_bgs_lock);
10503                 btrfs_wait_cache_io(root, trans, block_group,
10504                                     &block_group->io_ctl, path,
10505                                     block_group->key.objectid);
10506                 btrfs_put_block_group(block_group);
10507                 spin_lock(&trans->transaction->dirty_bgs_lock);
10508         }
10509
10510         if (!list_empty(&block_group->dirty_list)) {
10511                 list_del_init(&block_group->dirty_list);
10512                 btrfs_put_block_group(block_group);
10513         }
10514         spin_unlock(&trans->transaction->dirty_bgs_lock);
10515         mutex_unlock(&trans->transaction->cache_write_mutex);
10516
10517         if (!IS_ERR(inode)) {
10518                 ret = btrfs_orphan_add(trans, inode);
10519                 if (ret) {
10520                         btrfs_add_delayed_iput(inode);
10521                         goto out;
10522                 }
10523                 clear_nlink(inode);
10524                 /* One for the block groups ref */
10525                 spin_lock(&block_group->lock);
10526                 if (block_group->iref) {
10527                         block_group->iref = 0;
10528                         block_group->inode = NULL;
10529                         spin_unlock(&block_group->lock);
10530                         iput(inode);
10531                 } else {
10532                         spin_unlock(&block_group->lock);
10533                 }
10534                 /* One for our lookup ref */
10535                 btrfs_add_delayed_iput(inode);
10536         }
10537
10538         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10539         key.offset = block_group->key.objectid;
10540         key.type = 0;
10541
10542         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10543         if (ret < 0)
10544                 goto out;
10545         if (ret > 0)
10546                 btrfs_release_path(path);
10547         if (ret == 0) {
10548                 ret = btrfs_del_item(trans, tree_root, path);
10549                 if (ret)
10550                         goto out;
10551                 btrfs_release_path(path);
10552         }
10553
10554         spin_lock(&root->fs_info->block_group_cache_lock);
10555         rb_erase(&block_group->cache_node,
10556                  &root->fs_info->block_group_cache_tree);
10557         RB_CLEAR_NODE(&block_group->cache_node);
10558
10559         if (root->fs_info->first_logical_byte == block_group->key.objectid)
10560                 root->fs_info->first_logical_byte = (u64)-1;
10561         spin_unlock(&root->fs_info->block_group_cache_lock);
10562
10563         down_write(&block_group->space_info->groups_sem);
10564         /*
10565          * we must use list_del_init so people can check to see if they
10566          * are still on the list after taking the semaphore
10567          */
10568         list_del_init(&block_group->list);
10569         if (list_empty(&block_group->space_info->block_groups[index])) {
10570                 kobj = block_group->space_info->block_group_kobjs[index];
10571                 block_group->space_info->block_group_kobjs[index] = NULL;
10572                 clear_avail_alloc_bits(root->fs_info, block_group->flags);
10573         }
10574         up_write(&block_group->space_info->groups_sem);
10575         if (kobj) {
10576                 kobject_del(kobj);
10577                 kobject_put(kobj);
10578         }
10579
10580         if (block_group->has_caching_ctl)
10581                 caching_ctl = get_caching_control(block_group);
10582         if (block_group->cached == BTRFS_CACHE_STARTED)
10583                 wait_block_group_cache_done(block_group);
10584         if (block_group->has_caching_ctl) {
10585                 down_write(&root->fs_info->commit_root_sem);
10586                 if (!caching_ctl) {
10587                         struct btrfs_caching_control *ctl;
10588
10589                         list_for_each_entry(ctl,
10590                                     &root->fs_info->caching_block_groups, list)
10591                                 if (ctl->block_group == block_group) {
10592                                         caching_ctl = ctl;
10593                                         atomic_inc(&caching_ctl->count);
10594                                         break;
10595                                 }
10596                 }
10597                 if (caching_ctl)
10598                         list_del_init(&caching_ctl->list);
10599                 up_write(&root->fs_info->commit_root_sem);
10600                 if (caching_ctl) {
10601                         /* Once for the caching bgs list and once for us. */
10602                         put_caching_control(caching_ctl);
10603                         put_caching_control(caching_ctl);
10604                 }
10605         }
10606
10607         spin_lock(&trans->transaction->dirty_bgs_lock);
10608         if (!list_empty(&block_group->dirty_list)) {
10609                 WARN_ON(1);
10610         }
10611         if (!list_empty(&block_group->io_list)) {
10612                 WARN_ON(1);
10613         }
10614         spin_unlock(&trans->transaction->dirty_bgs_lock);
10615         btrfs_remove_free_space_cache(block_group);
10616
10617         spin_lock(&block_group->space_info->lock);
10618         list_del_init(&block_group->ro_list);
10619
10620         if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
10621                 WARN_ON(block_group->space_info->total_bytes
10622                         < block_group->key.offset);
10623                 WARN_ON(block_group->space_info->bytes_readonly
10624                         < block_group->key.offset);
10625                 WARN_ON(block_group->space_info->disk_total
10626                         < block_group->key.offset * factor);
10627         }
10628         block_group->space_info->total_bytes -= block_group->key.offset;
10629         block_group->space_info->bytes_readonly -= block_group->key.offset;
10630         block_group->space_info->disk_total -= block_group->key.offset * factor;
10631
10632         spin_unlock(&block_group->space_info->lock);
10633
10634         memcpy(&key, &block_group->key, sizeof(key));
10635
10636         lock_chunks(root);
10637         if (!list_empty(&em->list)) {
10638                 /* We're in the transaction->pending_chunks list. */
10639                 free_extent_map(em);
10640         }
10641         spin_lock(&block_group->lock);
10642         block_group->removed = 1;
10643         /*
10644          * At this point trimming can't start on this block group, because we
10645          * removed the block group from the tree fs_info->block_group_cache_tree
10646          * so no one can't find it anymore and even if someone already got this
10647          * block group before we removed it from the rbtree, they have already
10648          * incremented block_group->trimming - if they didn't, they won't find
10649          * any free space entries because we already removed them all when we
10650          * called btrfs_remove_free_space_cache().
10651          *
10652          * And we must not remove the extent map from the fs_info->mapping_tree
10653          * to prevent the same logical address range and physical device space
10654          * ranges from being reused for a new block group. This is because our
10655          * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10656          * completely transactionless, so while it is trimming a range the
10657          * currently running transaction might finish and a new one start,
10658          * allowing for new block groups to be created that can reuse the same
10659          * physical device locations unless we take this special care.
10660          *
10661          * There may also be an implicit trim operation if the file system
10662          * is mounted with -odiscard. The same protections must remain
10663          * in place until the extents have been discarded completely when
10664          * the transaction commit has completed.
10665          */
10666         remove_em = (atomic_read(&block_group->trimming) == 0);
10667         /*
10668          * Make sure a trimmer task always sees the em in the pinned_chunks list
10669          * if it sees block_group->removed == 1 (needs to lock block_group->lock
10670          * before checking block_group->removed).
10671          */
10672         if (!remove_em) {
10673                 /*
10674                  * Our em might be in trans->transaction->pending_chunks which
10675                  * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10676                  * and so is the fs_info->pinned_chunks list.
10677                  *
10678                  * So at this point we must be holding the chunk_mutex to avoid
10679                  * any races with chunk allocation (more specifically at
10680                  * volumes.c:contains_pending_extent()), to ensure it always
10681                  * sees the em, either in the pending_chunks list or in the
10682                  * pinned_chunks list.
10683                  */
10684                 list_move_tail(&em->list, &root->fs_info->pinned_chunks);
10685         }
10686         spin_unlock(&block_group->lock);
10687
10688         if (remove_em) {
10689                 struct extent_map_tree *em_tree;
10690
10691                 em_tree = &root->fs_info->mapping_tree.map_tree;
10692                 write_lock(&em_tree->lock);
10693                 /*
10694                  * The em might be in the pending_chunks list, so make sure the
10695                  * chunk mutex is locked, since remove_extent_mapping() will
10696                  * delete us from that list.
10697                  */
10698                 remove_extent_mapping(em_tree, em);
10699                 write_unlock(&em_tree->lock);
10700                 /* once for the tree */
10701                 free_extent_map(em);
10702         }
10703
10704         unlock_chunks(root);
10705
10706         ret = remove_block_group_free_space(trans, root->fs_info, block_group);
10707         if (ret)
10708                 goto out;
10709
10710         btrfs_put_block_group(block_group);
10711         btrfs_put_block_group(block_group);
10712
10713         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10714         if (ret > 0)
10715                 ret = -EIO;
10716         if (ret < 0)
10717                 goto out;
10718
10719         ret = btrfs_del_item(trans, root, path);
10720 out:
10721         btrfs_free_path(path);
10722         return ret;
10723 }
10724
10725 struct btrfs_trans_handle *
10726 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10727                                      const u64 chunk_offset)
10728 {
10729         struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10730         struct extent_map *em;
10731         struct map_lookup *map;
10732         unsigned int num_items;
10733
10734         read_lock(&em_tree->lock);
10735         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10736         read_unlock(&em_tree->lock);
10737         ASSERT(em && em->start == chunk_offset);
10738
10739         /*
10740          * We need to reserve 3 + N units from the metadata space info in order
10741          * to remove a block group (done at btrfs_remove_chunk() and at
10742          * btrfs_remove_block_group()), which are used for:
10743          *
10744          * 1 unit for adding the free space inode's orphan (located in the tree
10745          * of tree roots).
10746          * 1 unit for deleting the block group item (located in the extent
10747          * tree).
10748          * 1 unit for deleting the free space item (located in tree of tree
10749          * roots).
10750          * N units for deleting N device extent items corresponding to each
10751          * stripe (located in the device tree).
10752          *
10753          * In order to remove a block group we also need to reserve units in the
10754          * system space info in order to update the chunk tree (update one or
10755          * more device items and remove one chunk item), but this is done at
10756          * btrfs_remove_chunk() through a call to check_system_chunk().
10757          */
10758         map = em->map_lookup;
10759         num_items = 3 + map->num_stripes;
10760         free_extent_map(em);
10761
10762         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10763                                                            num_items, 1);
10764 }
10765
10766 /*
10767  * Process the unused_bgs list and remove any that don't have any allocated
10768  * space inside of them.
10769  */
10770 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10771 {
10772         struct btrfs_block_group_cache *block_group;
10773         struct btrfs_space_info *space_info;
10774         struct btrfs_root *root = fs_info->extent_root;
10775         struct btrfs_trans_handle *trans;
10776         int ret = 0;
10777
10778         if (!fs_info->open)
10779                 return;
10780
10781         spin_lock(&fs_info->unused_bgs_lock);
10782         while (!list_empty(&fs_info->unused_bgs)) {
10783                 u64 start, end;
10784                 int trimming;
10785
10786                 block_group = list_first_entry(&fs_info->unused_bgs,
10787                                                struct btrfs_block_group_cache,
10788                                                bg_list);
10789                 list_del_init(&block_group->bg_list);
10790
10791                 space_info = block_group->space_info;
10792
10793                 if (ret || btrfs_mixed_space_info(space_info)) {
10794                         btrfs_put_block_group(block_group);
10795                         continue;
10796                 }
10797                 spin_unlock(&fs_info->unused_bgs_lock);
10798
10799                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
10800
10801                 /* Don't want to race with allocators so take the groups_sem */
10802                 down_write(&space_info->groups_sem);
10803                 spin_lock(&block_group->lock);
10804                 if (block_group->reserved ||
10805                     btrfs_block_group_used(&block_group->item) ||
10806                     block_group->ro ||
10807                     list_is_singular(&block_group->list)) {
10808                         /*
10809                          * We want to bail if we made new allocations or have
10810                          * outstanding allocations in this block group.  We do
10811                          * the ro check in case balance is currently acting on
10812                          * this block group.
10813                          */
10814                         spin_unlock(&block_group->lock);
10815                         up_write(&space_info->groups_sem);
10816                         goto next;
10817                 }
10818                 spin_unlock(&block_group->lock);
10819
10820                 /* We don't want to force the issue, only flip if it's ok. */
10821                 ret = inc_block_group_ro(block_group, 0);
10822                 up_write(&space_info->groups_sem);
10823                 if (ret < 0) {
10824                         ret = 0;
10825                         goto next;
10826                 }
10827
10828                 /*
10829                  * Want to do this before we do anything else so we can recover
10830                  * properly if we fail to join the transaction.
10831                  */
10832                 trans = btrfs_start_trans_remove_block_group(fs_info,
10833                                                      block_group->key.objectid);
10834                 if (IS_ERR(trans)) {
10835                         btrfs_dec_block_group_ro(root, block_group);
10836                         ret = PTR_ERR(trans);
10837                         goto next;
10838                 }
10839
10840                 /*
10841                  * We could have pending pinned extents for this block group,
10842                  * just delete them, we don't care about them anymore.
10843                  */
10844                 start = block_group->key.objectid;
10845                 end = start + block_group->key.offset - 1;
10846                 /*
10847                  * Hold the unused_bg_unpin_mutex lock to avoid racing with
10848                  * btrfs_finish_extent_commit(). If we are at transaction N,
10849                  * another task might be running finish_extent_commit() for the
10850                  * previous transaction N - 1, and have seen a range belonging
10851                  * to the block group in freed_extents[] before we were able to
10852                  * clear the whole block group range from freed_extents[]. This
10853                  * means that task can lookup for the block group after we
10854                  * unpinned it from freed_extents[] and removed it, leading to
10855                  * a BUG_ON() at btrfs_unpin_extent_range().
10856                  */
10857                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
10858                 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10859                                   EXTENT_DIRTY);
10860                 if (ret) {
10861                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10862                         btrfs_dec_block_group_ro(root, block_group);
10863                         goto end_trans;
10864                 }
10865                 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10866                                   EXTENT_DIRTY);
10867                 if (ret) {
10868                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10869                         btrfs_dec_block_group_ro(root, block_group);
10870                         goto end_trans;
10871                 }
10872                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10873
10874                 /* Reset pinned so btrfs_put_block_group doesn't complain */
10875                 spin_lock(&space_info->lock);
10876                 spin_lock(&block_group->lock);
10877
10878                 space_info->bytes_pinned -= block_group->pinned;
10879                 space_info->bytes_readonly += block_group->pinned;
10880                 percpu_counter_add(&space_info->total_bytes_pinned,
10881                                    -block_group->pinned);
10882                 block_group->pinned = 0;
10883
10884                 spin_unlock(&block_group->lock);
10885                 spin_unlock(&space_info->lock);
10886
10887                 /* DISCARD can flip during remount */
10888                 trimming = btrfs_test_opt(root->fs_info, DISCARD);
10889
10890                 /* Implicit trim during transaction commit. */
10891                 if (trimming)
10892                         btrfs_get_block_group_trimming(block_group);
10893
10894                 /*
10895                  * Btrfs_remove_chunk will abort the transaction if things go
10896                  * horribly wrong.
10897                  */
10898                 ret = btrfs_remove_chunk(trans, root,
10899                                          block_group->key.objectid);
10900
10901                 if (ret) {
10902                         if (trimming)
10903                                 btrfs_put_block_group_trimming(block_group);
10904                         goto end_trans;
10905                 }
10906
10907                 /*
10908                  * If we're not mounted with -odiscard, we can just forget
10909                  * about this block group. Otherwise we'll need to wait
10910                  * until transaction commit to do the actual discard.
10911                  */
10912                 if (trimming) {
10913                         spin_lock(&fs_info->unused_bgs_lock);
10914                         /*
10915                          * A concurrent scrub might have added us to the list
10916                          * fs_info->unused_bgs, so use a list_move operation
10917                          * to add the block group to the deleted_bgs list.
10918                          */
10919                         list_move(&block_group->bg_list,
10920                                   &trans->transaction->deleted_bgs);
10921                         spin_unlock(&fs_info->unused_bgs_lock);
10922                         btrfs_get_block_group(block_group);
10923                 }
10924 end_trans:
10925                 btrfs_end_transaction(trans, root);
10926 next:
10927                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10928                 btrfs_put_block_group(block_group);
10929                 spin_lock(&fs_info->unused_bgs_lock);
10930         }
10931         spin_unlock(&fs_info->unused_bgs_lock);
10932 }
10933
10934 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10935 {
10936         struct btrfs_space_info *space_info;
10937         struct btrfs_super_block *disk_super;
10938         u64 features;
10939         u64 flags;
10940         int mixed = 0;
10941         int ret;
10942
10943         disk_super = fs_info->super_copy;
10944         if (!btrfs_super_root(disk_super))
10945                 return -EINVAL;
10946
10947         features = btrfs_super_incompat_flags(disk_super);
10948         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10949                 mixed = 1;
10950
10951         flags = BTRFS_BLOCK_GROUP_SYSTEM;
10952         ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
10953         if (ret)
10954                 goto out;
10955
10956         if (mixed) {
10957                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10958                 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
10959         } else {
10960                 flags = BTRFS_BLOCK_GROUP_METADATA;
10961                 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
10962                 if (ret)
10963                         goto out;
10964
10965                 flags = BTRFS_BLOCK_GROUP_DATA;
10966                 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
10967         }
10968 out:
10969         return ret;
10970 }
10971
10972 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
10973 {
10974         return unpin_extent_range(root, start, end, false);
10975 }
10976
10977 /*
10978  * It used to be that old block groups would be left around forever.
10979  * Iterating over them would be enough to trim unused space.  Since we
10980  * now automatically remove them, we also need to iterate over unallocated
10981  * space.
10982  *
10983  * We don't want a transaction for this since the discard may take a
10984  * substantial amount of time.  We don't require that a transaction be
10985  * running, but we do need to take a running transaction into account
10986  * to ensure that we're not discarding chunks that were released in
10987  * the current transaction.
10988  *
10989  * Holding the chunks lock will prevent other threads from allocating
10990  * or releasing chunks, but it won't prevent a running transaction
10991  * from committing and releasing the memory that the pending chunks
10992  * list head uses.  For that, we need to take a reference to the
10993  * transaction.
10994  */
10995 static int btrfs_trim_free_extents(struct btrfs_device *device,
10996                                    u64 minlen, u64 *trimmed)
10997 {
10998         u64 start = 0, len = 0;
10999         int ret;
11000
11001         *trimmed = 0;
11002
11003         /* Not writeable = nothing to do. */
11004         if (!device->writeable)
11005                 return 0;
11006
11007         /* No free space = nothing to do. */
11008         if (device->total_bytes <= device->bytes_used)
11009                 return 0;
11010
11011         ret = 0;
11012
11013         while (1) {
11014                 struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
11015                 struct btrfs_transaction *trans;
11016                 u64 bytes;
11017
11018                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
11019                 if (ret)
11020                         return ret;
11021
11022                 down_read(&fs_info->commit_root_sem);
11023
11024                 spin_lock(&fs_info->trans_lock);
11025                 trans = fs_info->running_transaction;
11026                 if (trans)
11027                         atomic_inc(&trans->use_count);
11028                 spin_unlock(&fs_info->trans_lock);
11029
11030                 ret = find_free_dev_extent_start(trans, device, minlen, start,
11031                                                  &start, &len);
11032                 if (trans)
11033                         btrfs_put_transaction(trans);
11034
11035                 if (ret) {
11036                         up_read(&fs_info->commit_root_sem);
11037                         mutex_unlock(&fs_info->chunk_mutex);
11038                         if (ret == -ENOSPC)
11039                                 ret = 0;
11040                         break;
11041                 }
11042
11043                 ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
11044                 up_read(&fs_info->commit_root_sem);
11045                 mutex_unlock(&fs_info->chunk_mutex);
11046
11047                 if (ret)
11048                         break;
11049
11050                 start += len;
11051                 *trimmed += bytes;
11052
11053                 if (fatal_signal_pending(current)) {
11054                         ret = -ERESTARTSYS;
11055                         break;
11056                 }
11057
11058                 cond_resched();
11059         }
11060
11061         return ret;
11062 }
11063
11064 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
11065 {
11066         struct btrfs_fs_info *fs_info = root->fs_info;
11067         struct btrfs_block_group_cache *cache = NULL;
11068         struct btrfs_device *device;
11069         struct list_head *devices;
11070         u64 group_trimmed;
11071         u64 start;
11072         u64 end;
11073         u64 trimmed = 0;
11074         u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
11075         int ret = 0;
11076
11077         /*
11078          * try to trim all FS space, our block group may start from non-zero.
11079          */
11080         if (range->len == total_bytes)
11081                 cache = btrfs_lookup_first_block_group(fs_info, range->start);
11082         else
11083                 cache = btrfs_lookup_block_group(fs_info, range->start);
11084
11085         while (cache) {
11086                 if (cache->key.objectid >= (range->start + range->len)) {
11087                         btrfs_put_block_group(cache);
11088                         break;
11089                 }
11090
11091                 start = max(range->start, cache->key.objectid);
11092                 end = min(range->start + range->len,
11093                                 cache->key.objectid + cache->key.offset);
11094
11095                 if (end - start >= range->minlen) {
11096                         if (!block_group_cache_done(cache)) {
11097                                 ret = cache_block_group(cache, 0);
11098                                 if (ret) {
11099                                         btrfs_put_block_group(cache);
11100                                         break;
11101                                 }
11102                                 ret = wait_block_group_cache_done(cache);
11103                                 if (ret) {
11104                                         btrfs_put_block_group(cache);
11105                                         break;
11106                                 }
11107                         }
11108                         ret = btrfs_trim_block_group(cache,
11109                                                      &group_trimmed,
11110                                                      start,
11111                                                      end,
11112                                                      range->minlen);
11113
11114                         trimmed += group_trimmed;
11115                         if (ret) {
11116                                 btrfs_put_block_group(cache);
11117                                 break;
11118                         }
11119                 }
11120
11121                 cache = next_block_group(fs_info->tree_root, cache);
11122         }
11123
11124         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
11125         devices = &root->fs_info->fs_devices->alloc_list;
11126         list_for_each_entry(device, devices, dev_alloc_list) {
11127                 ret = btrfs_trim_free_extents(device, range->minlen,
11128                                               &group_trimmed);
11129                 if (ret)
11130                         break;
11131
11132                 trimmed += group_trimmed;
11133         }
11134         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
11135
11136         range->len = trimmed;
11137         return ret;
11138 }
11139
11140 /*
11141  * btrfs_{start,end}_write_no_snapshoting() are similar to
11142  * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
11143  * data into the page cache through nocow before the subvolume is snapshoted,
11144  * but flush the data into disk after the snapshot creation, or to prevent
11145  * operations while snapshoting is ongoing and that cause the snapshot to be
11146  * inconsistent (writes followed by expanding truncates for example).
11147  */
11148 void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
11149 {
11150         percpu_counter_dec(&root->subv_writers->counter);
11151         /*
11152          * Make sure counter is updated before we wake up waiters.
11153          */
11154         smp_mb();
11155         if (waitqueue_active(&root->subv_writers->wait))
11156                 wake_up(&root->subv_writers->wait);
11157 }
11158
11159 int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
11160 {
11161         if (atomic_read(&root->will_be_snapshoted))
11162                 return 0;
11163
11164         percpu_counter_inc(&root->subv_writers->counter);
11165         /*
11166          * Make sure counter is updated before we check for snapshot creation.
11167          */
11168         smp_mb();
11169         if (atomic_read(&root->will_be_snapshoted)) {
11170                 btrfs_end_write_no_snapshoting(root);
11171                 return 0;
11172         }
11173         return 1;
11174 }
11175
11176 static int wait_snapshoting_atomic_t(atomic_t *a)
11177 {
11178         schedule();
11179         return 0;
11180 }
11181
11182 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11183 {
11184         while (true) {
11185                 int ret;
11186
11187                 ret = btrfs_start_write_no_snapshoting(root);
11188                 if (ret)
11189                         break;
11190                 wait_on_atomic_t(&root->will_be_snapshoted,
11191                                  wait_snapshoting_atomic_t,
11192                                  TASK_UNINTERRUPTIBLE);
11193         }
11194 }