Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 21 Mar 2015 17:53:37 +0000 (10:53 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 21 Mar 2015 17:53:37 +0000 (10:53 -0700)
Pull btrfs fixes from Chris Mason:
 "Most of these are fixing extent reservation accounting, or corners
  with tree writeback during commit.

  Josef's set does add a test, which isn't strictly a fix, but it'll
  keep us from making this same mistake again"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs:
  Btrfs: fix outstanding_extents accounting in DIO
  Btrfs: add sanity test for outstanding_extents accounting
  Btrfs: just free dummy extent buffers
  Btrfs: account merges/splits properly
  Btrfs: prepare block group cache before writing
  Btrfs: fix ASSERT(list_empty(&cur_trans->dirty_bgs_list)
  Btrfs: account for the correct number of extents for delalloc reservations
  Btrfs: fix merge delalloc logic
  Btrfs: fix comp_oper to get right order
  Btrfs: catch transaction abortion after waiting for it
  btrfs: fix sizeof format specifier in btrfs_check_super_valid()

1  2 
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/inode.c
fs/btrfs/transaction.c

diff --combined fs/btrfs/ctree.h
@@@ -1176,7 -1176,6 +1176,7 @@@ struct btrfs_space_info 
        struct percpu_counter total_bytes_pinned;
  
        struct list_head list;
 +      /* Protected by the spinlock 'lock'. */
        struct list_head ro_bgs;
  
        struct rw_semaphore groups_sem;
@@@ -3387,6 -3386,8 +3387,8 @@@ int btrfs_inc_extent_ref(struct btrfs_t
  
  int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root);
+ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root);
  int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
  int btrfs_free_block_groups(struct btrfs_fs_info *info);
  int btrfs_read_block_groups(struct btrfs_root *root);
@@@ -3909,6 -3910,9 +3911,9 @@@ int btrfs_prealloc_file_range_trans(str
                                    loff_t actual_len, u64 *alloc_hint);
  int btrfs_inode_check_errors(struct inode *inode);
  extern const struct dentry_operations btrfs_dentry_operations;
+ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ void btrfs_test_inode_set_ops(struct inode *inode);
+ #endif
  
  /* ioctl.c */
  long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --combined fs/btrfs/disk-io.c
@@@ -1724,11 -1724,12 +1724,11 @@@ static int setup_bdi(struct btrfs_fs_in
  {
        int err;
  
 -      bdi->capabilities = BDI_CAP_MAP_COPY;
 -      err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
 +      err = bdi_setup_and_register(bdi, "btrfs");
        if (err)
                return err;
  
 -      bdi->ra_pages   = default_backing_dev_info.ra_pages;
 +      bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
        bdi->congested_fn       = btrfs_congested_fn;
        bdi->congested_data     = info;
        return 0;
@@@ -2328,6 -2329,7 +2328,6 @@@ int open_ctree(struct super_block *sb
         */
        fs_info->btree_inode->i_size = OFFSET_MAX;
        fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
 -      fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
  
        RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
        extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
@@@ -3921,7 -3923,7 +3921,7 @@@ static int btrfs_check_super_valid(stru
        }
        if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
                        + sizeof(struct btrfs_chunk)) {
-               printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n",
+               printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n",
                                btrfs_super_sys_array_size(sb),
                                sizeof(struct btrfs_disk_key)
                                + sizeof(struct btrfs_chunk));
diff --combined fs/btrfs/extent-tree.c
        return ret;
  }
  
+ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root)
+ {
+       struct btrfs_block_group_cache *cache, *tmp;
+       struct btrfs_transaction *cur_trans = trans->transaction;
+       struct btrfs_path *path;
+       if (list_empty(&cur_trans->dirty_bgs) ||
+           !btrfs_test_opt(root, SPACE_CACHE))
+               return 0;
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       /* Could add new block groups, use _safe just in case */
+       list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
+                                dirty_list) {
+               if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+                       cache_save_setup(cache, trans, path);
+       }
+       btrfs_free_path(path);
+       return 0;
+ }
  int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
  {
@@@ -5110,7 -5136,11 +5136,11 @@@ int btrfs_delalloc_reserve_metadata(str
        num_bytes = ALIGN(num_bytes, root->sectorsize);
  
        spin_lock(&BTRFS_I(inode)->lock);
-       BTRFS_I(inode)->outstanding_extents++;
+       nr_extents = (unsigned)div64_u64(num_bytes +
+                                        BTRFS_MAX_EXTENT_SIZE - 1,
+                                        BTRFS_MAX_EXTENT_SIZE);
+       BTRFS_I(inode)->outstanding_extents += nr_extents;
+       nr_extents = 0;
  
        if (BTRFS_I(inode)->outstanding_extents >
            BTRFS_I(inode)->reserved_extents)
@@@ -5255,6 -5285,9 +5285,9 @@@ void btrfs_delalloc_release_metadata(st
        if (dropped > 0)
                to_free += btrfs_calc_trans_metadata_size(root, dropped);
  
+       if (btrfs_test_is_dummy_root(root))
+               return;
        trace_btrfs_space_reservation(root->fs_info, "delalloc",
                                      btrfs_ino(inode), to_free, 0);
        if (root->fs_info->quota_enabled) {
@@@ -9375,6 -9408,7 +9408,6 @@@ int btrfs_remove_block_group(struct btr
         * are still on the list after taking the semaphore
         */
        list_del_init(&block_group->list);
 -      list_del_init(&block_group->ro_list);
        if (list_empty(&block_group->space_info->block_groups[index])) {
                kobj = block_group->space_info->block_group_kobjs[index];
                block_group->space_info->block_group_kobjs[index] = NULL;
        btrfs_remove_free_space_cache(block_group);
  
        spin_lock(&block_group->space_info->lock);
 +      list_del_init(&block_group->ro_list);
        block_group->space_info->total_bytes -= block_group->key.offset;
        block_group->space_info->bytes_readonly -= block_group->key.offset;
        block_group->space_info->disk_total -= block_group->key.offset * factor;
diff --combined fs/btrfs/extent_io.c
@@@ -1408,8 -1408,8 +1408,8 @@@ int extent_range_redirty_for_io(struct 
        while (index <= end_index) {
                page = find_get_page(inode->i_mapping, index);
                BUG_ON(!page); /* Pages should be in the extent_io_tree */
 -              account_page_redirty(page);
                __set_page_dirty_nobuffers(page);
 +              account_page_redirty(page);
                page_cache_release(page);
                index++;
        }
@@@ -2191,7 -2191,7 +2191,7 @@@ void btrfs_free_io_failure_record(struc
  
                next = next_state(state);
  
 -              failrec = (struct io_failure_record *)state->private;
 +              failrec = (struct io_failure_record *)(unsigned long)state->private;
                free_extent_state(state);
                kfree(failrec);
  
@@@ -4968,6 -4968,12 +4968,12 @@@ static int release_extent_buffer(struc
  
                /* Should be safe to release our pages at this point */
                btrfs_release_extent_buffer_page(eb);
+ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+               if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
+                       __free_extent_buffer(eb);
+                       return 1;
+               }
+ #endif
                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
                return 1;
        }
diff --combined fs/btrfs/inode.c
@@@ -108,6 -108,13 +108,13 @@@ static struct extent_map *create_pinned
  
  static int btrfs_dirty_inode(struct inode *inode);
  
+ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ void btrfs_test_inode_set_ops(struct inode *inode)
+ {
+       BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ }
+ #endif
  static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode,  struct inode *dir,
                                     const struct qstr *qstr)
@@@ -1542,30 -1549,17 +1549,17 @@@ static void btrfs_split_extent_hook(str
                u64 new_size;
  
                /*
-                * We need the largest size of the remaining extent to see if we
-                * need to add a new outstanding extent.  Think of the following
-                * case
-                *
-                * [MEAX_EXTENT_SIZEx2 - 4k][4k]
-                *
-                * The new_size would just be 4k and we'd think we had enough
-                * outstanding extents for this if we only took one side of the
-                * split, same goes for the other direction.  We need to see if
-                * the larger size still is the same amount of extents as the
-                * original size, because if it is we need to add a new
-                * outstanding extent.  But if we split up and the larger size
-                * is less than the original then we are good to go since we've
-                * already accounted for the extra extent in our original
-                * accounting.
+                * See the explanation in btrfs_merge_extent_hook, the same
+                * applies here, just in reverse.
                 */
                new_size = orig->end - split + 1;
-               if ((split - orig->start) > new_size)
-                       new_size = split - orig->start;
-               num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+               num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
                                        BTRFS_MAX_EXTENT_SIZE);
-               if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
-                             BTRFS_MAX_EXTENT_SIZE) < num_extents)
+               new_size = split - orig->start;
+               num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+                                       BTRFS_MAX_EXTENT_SIZE);
+               if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+                             BTRFS_MAX_EXTENT_SIZE) >= num_extents)
                        return;
        }
  
@@@ -1591,8 -1585,10 +1585,10 @@@ static void btrfs_merge_extent_hook(str
        if (!(other->state & EXTENT_DELALLOC))
                return;
  
-       old_size = other->end - other->start + 1;
-       new_size = old_size + (new->end - new->start + 1);
+       if (new->start > other->start)
+               new_size = new->end - other->start + 1;
+       else
+               new_size = other->end - new->start + 1;
  
        /* we're not bigger than the max, unreserve the space and go */
        if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
        }
  
        /*
-        * If we grew by another max_extent, just return, we want to keep that
-        * reserved amount.
+        * We have to add up either side to figure out how many extents were
+        * accounted for before we merged into one big extent.  If the number of
+        * extents we accounted for is <= the amount we need for the new range
+        * then we can return, otherwise drop.  Think of it like this
+        *
+        * [ 4k][MAX_SIZE]
+        *
+        * So we've grown the extent by a MAX_SIZE extent, this would mean we
+        * need 2 outstanding extents, on one side we have 1 and the other side
+        * we have 1 so they are == and we can return.  But in this case
+        *
+        * [MAX_SIZE+4k][MAX_SIZE+4k]
+        *
+        * Each range on their own accounts for 2 extents, but merged together
+        * they are only 3 extents worth of accounting, so we need to drop in
+        * this case.
         */
+       old_size = other->end - other->start + 1;
        num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
                                BTRFS_MAX_EXTENT_SIZE);
+       old_size = new->end - new->start + 1;
+       num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+                                BTRFS_MAX_EXTENT_SIZE);
        if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
-                     BTRFS_MAX_EXTENT_SIZE) > num_extents)
+                     BTRFS_MAX_EXTENT_SIZE) >= num_extents)
                return;
  
        spin_lock(&BTRFS_I(inode)->lock);
@@@ -1686,6 -1701,10 +1701,10 @@@ static void btrfs_set_bit_hook(struct i
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
  
+               /* For sanity tests */
+               if (btrfs_test_is_dummy_root(root))
+                       return;
                __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
                                     root->fs_info->delalloc_batch);
                spin_lock(&BTRFS_I(inode)->lock);
@@@ -1741,6 -1760,10 +1760,10 @@@ static void btrfs_clear_bit_hook(struc
                    root != root->fs_info->tree_root)
                        btrfs_delalloc_release_metadata(inode, len);
  
+               /* For sanity tests. */
+               if (btrfs_test_is_dummy_root(root))
+                       return;
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
                    && do_list && !(state->state & EXTENT_NORESERVE))
                        btrfs_free_reserved_data_space(inode, len);
@@@ -3670,6 -3693,7 +3693,6 @@@ cache_acl
        switch (inode->i_mode & S_IFMT) {
        case S_IFREG:
                inode->i_mapping->a_ops = &btrfs_aops;
 -              inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
                inode->i_fop = &btrfs_file_operations;
                inode->i_op = &btrfs_file_inode_operations;
        case S_IFLNK:
                inode->i_op = &btrfs_symlink_inode_operations;
                inode->i_mapping->a_ops = &btrfs_symlink_aops;
 -              inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
                break;
        default:
                inode->i_op = &btrfs_special_inode_operations;
@@@ -6165,6 -6190,7 +6188,6 @@@ static int btrfs_create(struct inode *d
        inode->i_fop = &btrfs_file_operations;
        inode->i_op = &btrfs_file_inode_operations;
        inode->i_mapping->a_ops = &btrfs_aops;
 -      inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
  
        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err)
@@@ -7213,7 -7239,7 +7236,7 @@@ static int btrfs_get_blocks_direct(stru
        u64 start = iblock << inode->i_blkbits;
        u64 lockstart, lockend;
        u64 len = bh_result->b_size;
-       u64 orig_len = len;
+       u64 *outstanding_extents = NULL;
        int unlock_bits = EXTENT_LOCKED;
        int ret = 0;
  
        lockstart = start;
        lockend = start + len - 1;
  
+       if (current->journal_info) {
+               /*
+                * Need to pull our outstanding extents and set journal_info to NULL so
+                * that anything that needs to check if there's a transaction doesn't get
+                * confused.
+                */
+               outstanding_extents = current->journal_info;
+               current->journal_info = NULL;
+       }
        /*
         * If this errors out it's because we couldn't invalidate pagecache for
         * this range and we need to fallback to buffered.
@@@ -7348,11 -7384,20 +7381,20 @@@ unlock
                if (start + len > i_size_read(inode))
                        i_size_write(inode, start + len);
  
-               if (len < orig_len) {
+               /*
+                * If we have an outstanding_extents count still set then we're
+                * within our reservation, otherwise we need to adjust our inode
+                * counter appropriately.
+                */
+               if (*outstanding_extents) {
+                       (*outstanding_extents)--;
+               } else {
                        spin_lock(&BTRFS_I(inode)->lock);
                        BTRFS_I(inode)->outstanding_extents++;
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
+               current->journal_info = outstanding_extents;
                btrfs_free_reserved_data_space(inode, len);
        }
  
  unlock_err:
        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                         unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+       if (outstanding_extents)
+               current->journal_info = outstanding_extents;
        return ret;
  }
  
@@@ -8075,6 -8122,7 +8119,7 @@@ static ssize_t btrfs_direct_IO(int rw, 
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
+       u64 outstanding_extents = 0;
        size_t count = 0;
        int flags = 0;
        bool wakeup = true;
                ret = btrfs_delalloc_reserve_space(inode, count);
                if (ret)
                        goto out;
+               outstanding_extents = div64_u64(count +
+                                               BTRFS_MAX_EXTENT_SIZE - 1,
+                                               BTRFS_MAX_EXTENT_SIZE);
+               /*
+                * We need to know how many extents we reserved so that we can
+                * do the accounting properly if we go over the number we
+                * originally calculated.  Abuse current->journal_info for this.
+                */
+               current->journal_info = &outstanding_extents;
        } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
                                     &BTRFS_I(inode)->runtime_flags)) {
                inode_dio_done(inode);
                        iter, offset, btrfs_get_blocks_direct, NULL,
                        btrfs_submit_direct, flags);
        if (rw & WRITE) {
+               current->journal_info = NULL;
                if (ret < 0 && ret != -EIOCBQUEUED)
                        btrfs_delalloc_release_space(inode, count);
                else if (ret >= 0 && (size_t)ret < count)
@@@ -9277,6 -9336,7 +9333,6 @@@ static int btrfs_symlink(struct inode *
        inode->i_fop = &btrfs_file_operations;
        inode->i_op = &btrfs_file_inode_operations;
        inode->i_mapping->a_ops = &btrfs_aops;
 -      inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
  
        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
  
        inode->i_op = &btrfs_symlink_inode_operations;
        inode->i_mapping->a_ops = &btrfs_symlink_aops;
 -      inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
        inode_set_bytes(inode, name_len);
        btrfs_i_size_write(inode, name_len);
        err = btrfs_update_inode(trans, root, inode);
@@@ -9531,6 -9592,7 +9587,6 @@@ static int btrfs_tmpfile(struct inode *
        inode->i_op = &btrfs_file_inode_operations;
  
        inode->i_mapping->a_ops = &btrfs_aops;
 -      inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
  
        ret = btrfs_init_inode_security(trans, inode, dir, NULL);
diff --combined fs/btrfs/transaction.c
@@@ -1023,17 -1023,13 +1023,13 @@@ static int update_cowonly_root(struct b
        u64 old_root_bytenr;
        u64 old_root_used;
        struct btrfs_root *tree_root = root->fs_info->tree_root;
-       bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID);
  
        old_root_used = btrfs_root_used(&root->root_item);
-       btrfs_write_dirty_block_groups(trans, root);
  
        while (1) {
                old_root_bytenr = btrfs_root_bytenr(&root->root_item);
                if (old_root_bytenr == root->node->start &&
-                   old_root_used == btrfs_root_used(&root->root_item) &&
-                   (!extent_root ||
-                    list_empty(&trans->transaction->dirty_bgs)))
+                   old_root_used == btrfs_root_used(&root->root_item))
                        break;
  
                btrfs_set_root_node(&root->root_item, root->node);
                        return ret;
  
                old_root_used = btrfs_root_used(&root->root_item);
-               if (extent_root) {
-                       ret = btrfs_write_dirty_block_groups(trans, root);
-                       if (ret)
-                               return ret;
-               }
-               ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-               if (ret)
-                       return ret;
        }
  
        return 0;
@@@ -1068,6 -1056,7 +1056,7 @@@ static noinline int commit_cowonly_root
                                         struct btrfs_root *root)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
+       struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
        struct list_head *next;
        struct extent_buffer *eb;
        int ret;
        if (ret)
                return ret;
  
+       ret = btrfs_setup_space_cache(trans, root);
+       if (ret)
+               return ret;
        /* run_qgroups might have added some more refs */
        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        if (ret)
                return ret;
+ again:
        while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                next = fs_info->dirty_cowonly_roots.next;
                list_del_init(next);
                ret = update_cowonly_root(trans, root);
                if (ret)
                        return ret;
+               ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+               if (ret)
+                       return ret;
        }
  
+       while (!list_empty(dirty_bgs)) {
+               ret = btrfs_write_dirty_block_groups(trans, root);
+               if (ret)
+                       return ret;
+               ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+               if (ret)
+                       return ret;
+       }
+       if (!list_empty(&fs_info->dirty_cowonly_roots))
+               goto again;
        list_add_tail(&fs_info->extent_root->dirty_list,
                      &trans->transaction->switch_commits);
        btrfs_after_dev_replace_commit(fs_info);
@@@ -1811,6 -1819,9 +1819,9 @@@ int btrfs_commit_transaction(struct btr
  
                wait_for_commit(root, cur_trans);
  
+               if (unlikely(cur_trans->aborted))
+                       ret = cur_trans->aborted;
                btrfs_put_transaction(cur_trans);
  
                return ret;
@@@ -2130,7 -2141,7 +2141,7 @@@ void btrfs_apply_pending_changes(struc
        unsigned long prev;
        unsigned long bit;
  
 -      prev = cmpxchg(&fs_info->pending_changes, 0, 0);
 +      prev = xchg(&fs_info->pending_changes, 0);
        if (!prev)
                return;