Merge tag 'iommu-config-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
[cascardo/linux.git] / fs / btrfs / inode.c
index 016c403..e687bb0 100644 (file)
@@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 
                key.objectid = btrfs_ino(inode);
                key.offset = start;
-               btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+               key.type = BTRFS_EXTENT_DATA_KEY;
 
                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                path->leave_spinning = 1;
@@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
                data_len = compressed_size;
 
        if (start > 0 ||
-           actual_end >= PAGE_CACHE_SIZE ||
-           data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+           actual_end > PAGE_CACHE_SIZE ||
+           data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
            (!compressed_size &&
            (actual_end & (root->sectorsize - 1)) == 0) ||
            end + 1 < isize ||
@@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow,
        return 0;
 }
 
+static inline int inode_need_compress(struct inode *inode)
+{      /* Returns 1 if writes to @inode should try compression, else 0. */
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+
+       /* FORCE_COMPRESS option wins, even over the NOCOMPRESS flag below */
+       if (btrfs_test_opt(root, FORCE_COMPRESS))
+               return 1;
+       /* inode flagged NOCOMPRESS (bad compression ratios): skip */
+       if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+               return 0;
+       if (btrfs_test_opt(root, COMPRESS) ||
+           BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
+           BTRFS_I(inode)->force_compress)
+               return 1;       /* requested by option, inode flag or force_compress */
+       return 0;
+}
+
 /*
  * we create compressed extents in two phases.  The first
  * phase compresses a range of pages that have already been
@@ -365,7 +382,7 @@ static noinline int add_async_extent(struct async_cow *cow,
  * are written in the same order that the flusher thread sent them
  * down.
  */
-static noinline int compress_file_range(struct inode *inode,
+static noinline void compress_file_range(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start, u64 end,
                                        struct async_cow *async_cow,
@@ -394,14 +411,6 @@ static noinline int compress_file_range(struct inode *inode,
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
 
-       /*
-        * skip compression for a small file range(<=blocksize) that
-        * isn't an inline extent, since it dosen't save disk space at all.
-        */
-       if ((end - start + 1) <= blocksize &&
-           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
-               goto cleanup_and_bail_uncompressed;
-
        actual_end = min_t(u64, isize, end + 1);
 again:
        will_compress = 0;
@@ -423,6 +432,14 @@ again:
 
        total_compressed = actual_end - start;
 
+       /*
+        * skip compression for a small file range(<=blocksize) that
+        * isn't an inline extent, since it doesn't save disk space at all.
+        */
+       if (total_compressed <= blocksize &&
+          (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+               goto cleanup_and_bail_uncompressed;
+
        /* we want to make sure that amount of ram required to uncompress
         * an extent is reasonable, so we limit the total size in ram
         * of a compressed extent to 128k.  This is a crucial number
@@ -444,10 +461,7 @@ again:
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
-       if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
-           (btrfs_test_opt(root, COMPRESS) ||
-            (BTRFS_I(inode)->force_compress) ||
-            (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
+       if (inode_need_compress(inode)) {
                WARN_ON(pages);
                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
                if (!pages) {
@@ -513,7 +527,10 @@ cont:
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
                                EXTENT_DEFRAG;
+                       unsigned long page_error_op;
+
                        clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
+                       page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
 
                        /*
                         * inline extent creation worked or returned error,
@@ -524,6 +541,7 @@ cont:
                                                     clear_flags, PAGE_UNLOCK |
                                                     PAGE_CLEAR_DIRTY |
                                                     PAGE_SET_WRITEBACK |
+                                                    page_error_op |
                                                     PAGE_END_WRITEBACK);
                        goto free_pages_out;
                }
@@ -606,8 +624,7 @@ cleanup_and_bail_uncompressed:
                *num_added += 1;
        }
 
-out:
-       return ret;
+       return;
 
 free_pages_out:
        for (i = 0; i < nr_pages_ret; i++) {
@@ -615,8 +632,22 @@ free_pages_out:
                page_cache_release(pages[i]);
        }
        kfree(pages);
+}
 
-       goto out;
+static void free_async_extent_pages(struct async_extent *async_extent)
+{      /* Release every page in async_extent->pages, then free the array. */
+       int i;
+
+       if (!async_extent->pages)
+               return;         /* no page array: nothing to release */
+
+       for (i = 0; i < async_extent->nr_pages; i++) {
+               WARN_ON(async_extent->pages[i]->mapping);
+               page_cache_release(async_extent->pages[i]);
+       }
+       kfree(async_extent->pages);
+       async_extent->nr_pages = 0;
+       async_extent->pages = NULL;     /* a repeated call hits the early return */
 }
 
 /*
@@ -625,7 +656,7 @@ free_pages_out:
  * queued.  We walk all the async extents created by compress_file_range
  * and send them down to the disk.
  */
-static noinline int submit_compressed_extents(struct inode *inode,
+static noinline void submit_compressed_extents(struct inode *inode,
                                              struct async_cow *async_cow)
 {
        struct async_extent *async_extent;
@@ -637,9 +668,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
        struct extent_io_tree *io_tree;
        int ret = 0;
 
-       if (list_empty(&async_cow->extents))
-               return 0;
-
 again:
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
@@ -695,15 +723,7 @@ retry:
                                           async_extent->compressed_size,
                                           0, alloc_hint, &ins, 1, 1);
                if (ret) {
-                       int i;
-
-                       for (i = 0; i < async_extent->nr_pages; i++) {
-                               WARN_ON(async_extent->pages[i]->mapping);
-                               page_cache_release(async_extent->pages[i]);
-                       }
-                       kfree(async_extent->pages);
-                       async_extent->nr_pages = 0;
-                       async_extent->pages = NULL;
+                       free_async_extent_pages(async_extent);
 
                        if (ret == -ENOSPC) {
                                unlock_extent(io_tree, async_extent->start,
@@ -800,15 +820,26 @@ retry:
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages);
+               if (ret) {
+                       struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+                       struct page *p = async_extent->pages[0];
+                       const u64 start = async_extent->start;
+                       const u64 end = start + async_extent->ram_size - 1;
+
+                       p->mapping = inode->i_mapping;
+                       tree->ops->writepage_end_io_hook(p, start, end,
+                                                        NULL, 0);
+                       p->mapping = NULL;
+                       extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+                                                    PAGE_END_WRITEBACK |
+                                                    PAGE_SET_ERROR);
+                       free_async_extent_pages(async_extent);
+               }
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
-               if (ret)
-                       goto out;
                cond_resched();
        }
-       ret = 0;
-out:
-       return ret;
+       return;
 out_free_reserve:
        btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_free:
@@ -818,7 +849,9 @@ out_free:
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
-                                    PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+                                    PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
+                                    PAGE_SET_ERROR);
+       free_async_extent_pages(async_extent);
        kfree(async_extent);
        goto again;
 }
@@ -1094,7 +1127,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                async_cow->locked_page = locked_page;
                async_cow->start = start;
 
-               if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+               if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+                   !btrfs_test_opt(root, FORCE_COMPRESS))
                        cur_end = end;
                else
                        cur_end = min(end, start + 512 * 1024 - 1);
@@ -1303,7 +1337,7 @@ next_slot:
                         * we fall into common COW way.
                         */
                        if (!nolock) {
-                               err = btrfs_start_nocow_write(root);
+                               err = btrfs_start_write_no_snapshoting(root);
                                if (!err)
                                        goto out_check;
                        }
@@ -1327,7 +1361,7 @@ out_check:
                if (extent_end <= start) {
                        path->slots[0]++;
                        if (!nolock && nocow)
-                               btrfs_end_nocow_write(root);
+                               btrfs_end_write_no_snapshoting(root);
                        goto next_slot;
                }
                if (!nocow) {
@@ -1347,7 +1381,7 @@ out_check:
                                             page_started, nr_written, 1);
                        if (ret) {
                                if (!nolock && nocow)
-                                       btrfs_end_nocow_write(root);
+                                       btrfs_end_write_no_snapshoting(root);
                                goto error;
                        }
                        cow_start = (u64)-1;
@@ -1398,7 +1432,7 @@ out_check:
                                                      num_bytes);
                        if (ret) {
                                if (!nolock && nocow)
-                                       btrfs_end_nocow_write(root);
+                                       btrfs_end_write_no_snapshoting(root);
                                goto error;
                        }
                }
@@ -1409,7 +1443,7 @@ out_check:
                                             EXTENT_DELALLOC, PAGE_UNLOCK |
                                             PAGE_SET_PRIVATE2);
                if (!nolock && nocow)
-                       btrfs_end_nocow_write(root);
+                       btrfs_end_write_no_snapshoting(root);
                cur_offset = extent_end;
                if (cur_offset > end)
                        break;
@@ -1445,6 +1479,26 @@ error:
        return ret;
 }
 
+static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
+{
+
+       if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+           !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
+               return 0;       /* neither flag set: nothing to override */
+
+       /*
+        * @defrag_bytes is only a hint and is read without the spinlock;
+        * a non-zero value means the file is being defragged.
+        * Force COW if the given range is marked for defrag.
+        */
+       if (BTRFS_I(inode)->defrag_bytes &&
+           test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+                          EXTENT_DEFRAG, 0, NULL))
+               return 1;
+
+       return 0;
+}
+
 /*
  * extent_io.c call back to do delayed allocation processing
  */
@@ -1453,17 +1507,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
                              unsigned long *nr_written)
 {
        int ret;
-       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int force_cow = need_force_cow(inode, start, end);
 
-       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
+       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
                ret = run_delalloc_nocow(inode, locked_page, start, end,
                                         page_started, 1, nr_written);
-       } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
+       } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
                ret = run_delalloc_nocow(inode, locked_page, start, end,
                                         page_started, 0, nr_written);
-       } else if (!btrfs_test_opt(root, COMPRESS) &&
-                  !(BTRFS_I(inode)->force_compress) &&
-                  !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
+       } else if (!inode_need_compress(inode)) {
                ret = cow_file_range(inode, locked_page, start, end,
                                      page_started, nr_written, 1);
        } else {
@@ -1555,6 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
                               struct extent_state *state, unsigned long *bits)
 {
 
+       if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+               WARN_ON(1);
        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
         * but in this case, we are only testing for the DELALLOC
@@ -1577,6 +1631,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
                                     root->fs_info->delalloc_batch);
                spin_lock(&BTRFS_I(inode)->lock);
                BTRFS_I(inode)->delalloc_bytes += len;
+               if (*bits & EXTENT_DEFRAG)
+                       BTRFS_I(inode)->defrag_bytes += len;
                if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
                                         &BTRFS_I(inode)->runtime_flags))
                        btrfs_add_delalloc_inodes(root, inode);
@@ -1591,6 +1647,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                                 struct extent_state *state,
                                 unsigned long *bits)
 {
+       u64 len = state->end + 1 - state->start;
+
+       spin_lock(&BTRFS_I(inode)->lock);
+       if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
+               BTRFS_I(inode)->defrag_bytes -= len;
+       spin_unlock(&BTRFS_I(inode)->lock);
+
        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
         * but in this case, we are only testing for the DELALLOC
@@ -1598,7 +1661,6 @@ static void btrfs_clear_bit_hook(struct inode *inode,
         */
        if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
-               u64 len = state->end + 1 - state->start;
                bool do_list = !btrfs_is_free_space_inode(inode);
 
                if (*bits & EXTENT_FIRST_DELALLOC) {
@@ -2660,6 +2722,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                goto out;
        }
 
+       btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
+                                    ordered_extent->file_offset +
+                                    ordered_extent->len - 1);
+
        if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
                truncated = true;
                logical_len = ordered_extent->truncated_len;
@@ -2856,6 +2922,40 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
        return 0;
 }
 
+static int __readpage_endio_check(struct inode *inode,
+                                 struct btrfs_io_bio *io_bio,
+                                 int icsum, struct page *page,
+                                 int pgoff, u64 start, size_t len)
+{      /* Checksum @len bytes at @pgoff in @page; 0 on success, -EIO on mismatch. */
+       char *kaddr;
+       u32 csum_expected;
+       u32 csum = ~(u32)0;
+       static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                     DEFAULT_RATELIMIT_BURST);
+
+       csum_expected = *(((u32 *)io_bio->csum) + icsum); /* @icsum indexes u32 entries */
+
+       kaddr = kmap_atomic(page);
+       csum = btrfs_csum_data(kaddr + pgoff, csum,  len);
+       btrfs_csum_final(csum, (char *)&csum);
+       if (csum != csum_expected)
+               goto zeroit;
+
+       kunmap_atomic(kaddr);
+       return 0;       /* computed checksum matches the expected one */
+zeroit:                /* mismatch: log (rate limited; @start only used here), then overwrite */
+       if (__ratelimit(&_rs))
+               btrfs_info(BTRFS_I(inode)->root->fs_info,
+                          "csum failed ino %llu off %llu csum %u expected csum %u",
+                          btrfs_ino(inode), start, csum, csum_expected);
+       memset(kaddr + pgoff, 1, len);  /* fill the bad range with 0x01 bytes */
+       flush_dcache_page(page);
+       kunmap_atomic(kaddr);
+       if (csum_expected == 0)
+               return 0;       /* expected csum of 0: mismatch is not reported as an error */
+       return -EIO;
+}
+
 /*
  * when reads are done, we need to check csums to verify the data is correct
  * if there's a match, we allow the bio to finish.  If not, the code in
@@ -2868,20 +2968,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
        size_t offset = start - page_offset(page);
        struct inode *inode = page->mapping->host;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       char *kaddr;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       u32 csum_expected;
-       u32 csum = ~(u32)0;
-       static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
-                                     DEFAULT_RATELIMIT_BURST);
 
        if (PageChecked(page)) {
                ClearPageChecked(page);
-               goto good;
+               return 0;
        }
 
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
-               goto good;
+               return 0;
 
        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
            test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -2891,28 +2986,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
        }
 
        phy_offset >>= inode->i_sb->s_blocksize_bits;
-       csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
-
-       kaddr = kmap_atomic(page);
-       csum = btrfs_csum_data(kaddr + offset, csum,  end - start + 1);
-       btrfs_csum_final(csum, (char *)&csum);
-       if (csum != csum_expected)
-               goto zeroit;
-
-       kunmap_atomic(kaddr);
-good:
-       return 0;
-
-zeroit:
-       if (__ratelimit(&_rs))
-               btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
-                       btrfs_ino(page->mapping->host), start, csum, csum_expected);
-       memset(kaddr + offset, 1, end - start + 1);
-       flush_dcache_page(page);
-       kunmap_atomic(kaddr);
-       if (csum_expected == 0)
-               return 0;
-       return -EIO;
+       return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
+                                     start, (size_t)(end - start + 1));
 }
 
 struct delayed_iput {
@@ -3159,7 +3234,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
        path->reada = -1;
 
        key.objectid = BTRFS_ORPHAN_OBJECTID;
-       btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+       key.type = BTRFS_ORPHAN_ITEM_KEY;
        key.offset = (u64)-1;
 
        while (1) {
@@ -3186,7 +3261,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                /* make sure the item matches what we want */
                if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
                        break;
-               if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+               if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
                        break;
 
                /* release the path since we're done with it */
@@ -3662,7 +3737,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
         * without delay
         */
        if (!btrfs_is_free_space_inode(inode)
-           && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+           && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+           && !root->fs_info->log_root_recovering) {
                btrfs_update_root_times(trans, root);
 
                ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -4085,7 +4161,7 @@ search_again:
                fi = NULL;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-               found_type = btrfs_key_type(&found_key);
+               found_type = found_key.type;
 
                if (found_key.objectid != ino)
                        break;
@@ -4523,6 +4599,26 @@ next:
        return err;
 }
 
+static int wait_snapshoting_atomic_t(atomic_t *a)
+{      /* Action callback handed to wait_on_atomic_t() by wait_for_snapshot_creation(). */
+       schedule();     /* @a is unused; just give up the CPU */
+       return 0;
+}
+
+static void wait_for_snapshot_creation(struct btrfs_root *root)
+{      /* Loop until btrfs_start_write_no_snapshoting() returns non-zero. */
+       while (true) {
+               int ret;
+
+               ret = btrfs_start_write_no_snapshoting(root);
+               if (ret)
+                       break;  /* NOTE(review): presumably the write section is now held; caller must end it */
+               wait_on_atomic_t(&root->will_be_snapshoted,
+                                wait_snapshoting_atomic_t,
+                                TASK_UNINTERRUPTIBLE); /* presumably sleeps until the counter is zero - confirm */
+       }
+}
+
 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4547,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 
        if (newsize > oldsize) {
                truncate_pagecache(inode, newsize);
+               /*
+                * Don't do an expanding truncate while snapshoting is ongoing.
+                * This is to ensure the snapshot captures a fully consistent
+                * state of this file - if the snapshot captures this expanding
+                * truncation, it must capture all writes that happened before
+                * this truncation.
+                */
+               wait_for_snapshot_creation(root);
                ret = btrfs_cont_expand(inode, oldsize, newsize);
-               if (ret)
+               if (ret) {
+                       btrfs_end_write_no_snapshoting(root);
                        return ret;
+               }
 
                trans = btrfs_start_transaction(root, 1);
-               if (IS_ERR(trans))
+               if (IS_ERR(trans)) {
+                       btrfs_end_write_no_snapshoting(root);
                        return PTR_ERR(trans);
+               }
 
                i_size_write(inode, newsize);
                btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
                ret = btrfs_update_inode(trans, root, inode);
+               btrfs_end_write_no_snapshoting(root);
                btrfs_end_transaction(trans, root);
        } else {
 
@@ -4747,6 +4856,8 @@ void btrfs_evict_inode(struct inode *inode)
        /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
        btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
+       btrfs_free_io_failure_record(inode, 0, (u64)-1);
+
        if (root->fs_info->log_root_recovering) {
                BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
                                 &BTRFS_I(inode)->runtime_flags));
@@ -5202,42 +5313,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
                        iput(inode);
                        inode = ERR_PTR(ret);
                }
-               /*
-                * If orphan cleanup did remove any orphans, it means the tree
-                * was modified and therefore the commit root is not the same as
-                * the current root anymore. This is a problem, because send
-                * uses the commit root and therefore can see inode items that
-                * don't exist in the current root anymore, and for example make
-                * calls to btrfs_iget, which will do tree lookups based on the
-                * current root and not on the commit root. Those lookups will
-                * fail, returning a -ESTALE error, and making send fail with
-                * that error. So make sure a send does not see any orphans we
-                * have just removed, and that it will see the same inodes
-                * regardless of whether a transaction commit happened before
-                * it started (meaning that the commit root will be the same as
-                * the current root) or not.
-                */
-               if (sub_root->node != sub_root->commit_root) {
-                       u64 sub_flags = btrfs_root_flags(&sub_root->root_item);
-
-                       if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) {
-                               struct extent_buffer *eb;
-
-                               /*
-                                * Assert we can't have races between dentry
-                                * lookup called through the snapshot creation
-                                * ioctl and the VFS.
-                                */
-                               ASSERT(mutex_is_locked(&dir->i_mutex));
-
-                               down_write(&root->fs_info->commit_root_sem);
-                               eb = sub_root->commit_root;
-                               sub_root->commit_root =
-                                       btrfs_root_node(sub_root);
-                               up_write(&root->fs_info->commit_root_sem);
-                               free_extent_buffer(eb);
-                       }
-               }
        }
 
        return inode;
@@ -5280,7 +5355,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
                        return ERR_CAST(inode);
        }
 
-       return d_materialise_unique(dentry, inode);
+       return d_splice_alias(inode, dentry);
 }
 
 unsigned char btrfs_filetype_table[] = {
@@ -5331,7 +5406,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
                btrfs_get_delayed_items(inode, &ins_list, &del_list);
        }
 
-       btrfs_set_key_type(&key, key_type);
+       key.type = key_type;
        key.offset = ctx->pos;
        key.objectid = btrfs_ino(inode);
 
@@ -5356,7 +5431,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 
                if (found_key.objectid != key.objectid)
                        break;
-               if (btrfs_key_type(&found_key) != key_type)
+               if (found_key.type != key_type)
                        break;
                if (found_key.offset < ctx->pos)
                        goto next;
@@ -5568,7 +5643,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
        int ret;
 
        key.objectid = btrfs_ino(inode);
-       btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+       key.type = BTRFS_DIR_INDEX_KEY;
        key.offset = (u64)-1;
 
        path = btrfs_alloc_path();
@@ -5600,7 +5675,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
        if (found_key.objectid != btrfs_ino(inode) ||
-           btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
+           found_key.type != BTRFS_DIR_INDEX_KEY) {
                BTRFS_I(inode)->index_cnt = 2;
                goto out;
        }
@@ -5718,7 +5793,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
 
        key[0].objectid = objectid;
-       btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
+       key[0].type = BTRFS_INODE_ITEM_KEY;
        key[0].offset = 0;
 
        sizes[0] = sizeof(struct btrfs_inode_item);
@@ -5731,7 +5806,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                 * add more hard links than can fit in the ref item.
                 */
                key[1].objectid = objectid;
-               btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+               key[1].type = BTRFS_INODE_REF_KEY;
                key[1].offset = ref_objectid;
 
                sizes[1] = name_len + sizeof(*ref);
@@ -5740,7 +5815,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        location = &BTRFS_I(inode)->location;
        location->objectid = objectid;
        location->offset = 0;
-       btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+       location->type = BTRFS_INODE_ITEM_KEY;
 
        ret = btrfs_insert_inode_locked(inode);
        if (ret < 0)
@@ -5832,7 +5907,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
                memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
        } else {
                key.objectid = ino;
-               btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+               key.type = BTRFS_INODE_ITEM_KEY;
                key.offset = 0;
        }
 
@@ -6191,21 +6266,60 @@ out_fail_inode:
        goto out_fail;
 }
 
+/* Return the extent map that follows @em in its rb-tree, or NULL; caller needs to ensure locks */
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+       struct rb_node *next;
+
+       next = rb_next(&em->rb_node);
+       if (!next)
+               return NULL;    /* rb_next() found no successor */
+       return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{      /* Counterpart of next_extent_map(): map preceding @em in the rb-tree, or NULL. */
+       struct rb_node *prev;
+
+       prev = rb_prev(&em->rb_node);
+       if (!prev)
+               return NULL;    /* rb_prev() found no predecessor */
+       return container_of(prev, struct extent_map, rb_node);
+}
+
 /* helper for btfs_get_extent.  Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
  * and an extent that you want to insert, deal with overlap and insert
- * the new extent into the tree.
+ * the best fitted new extent into the tree.
  */
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
                                struct extent_map *existing,
                                struct extent_map *em,
                                u64 map_start)
 {
+       struct extent_map *prev;
+       struct extent_map *next;
+       u64 start;
+       u64 end;
        u64 start_diff;
 
        BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
-       start_diff = map_start - em->start;
-       em->start = map_start;
-       em->len = existing->start - em->start;
+
+       if (existing->start > map_start) {
+               next = existing;
+               prev = prev_extent_map(next);
+       } else {
+               prev = existing;
+               next = next_extent_map(prev);
+       }
+
+       start = prev ? extent_map_end(prev) : em->start;
+       start = max_t(u64, start, em->start);
+       end = next ? next->start : extent_map_end(em);
+       end = min_t(u64, end, extent_map_end(em));
+       start_diff = start - em->start;
+       em->start = start;
+       em->len = end - start;
        if (em->block_start < EXTENT_MAP_LAST_BYTE &&
            !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                em->block_start += start_diff;
@@ -6333,7 +6447,7 @@ again:
                              struct btrfs_file_extent_item);
        /* are we inside the extent that was found? */
        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-       found_type = btrfs_key_type(&found_key);
+       found_type = found_key.type;
        if (found_key.objectid != objectid ||
            found_type != BTRFS_EXTENT_DATA_KEY) {
                /*
@@ -6482,25 +6596,21 @@ insert:
 
                ret = 0;
 
-               existing = lookup_extent_mapping(em_tree, start, len);
-               if (existing && (existing->start > start ||
-                   existing->start + existing->len <= start)) {
+               existing = search_extent_mapping(em_tree, start, len);
+               /*
+                * existing will always be non-NULL, since there must be
+                * extent causing the -EEXIST.
+                */
+               if (start >= extent_map_end(existing) ||
+                   start <= existing->start) {
+                       /*
+                        * The existing extent map is the one nearest to
+                        * the [start, start + len) range which overlaps
+                        */
+                       err = merge_extent_mapping(em_tree, existing,
+                                                  em, start);
                        free_extent_map(existing);
-                       existing = NULL;
-               }
-               if (!existing) {
-                       existing = lookup_extent_mapping(em_tree, em->start,
-                                                        em->len);
-                       if (existing) {
-                               err = merge_extent_mapping(em_tree, existing,
-                                                          em, start);
-                               free_extent_map(existing);
-                               if (err) {
-                                       free_extent_map(em);
-                                       em = NULL;
-                               }
-                       } else {
-                               err = -EIO;
+                       if (err) {
                                free_extent_map(em);
                                em = NULL;
                        }
@@ -6942,9 +7052,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                        btrfs_put_ordered_extent(ordered);
                } else {
                        /* Screw you mmap */
-                       ret = filemap_write_and_wait_range(inode->i_mapping,
-                                                          lockstart,
-                                                          lockend);
+                       ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
+                       if (ret)
+                               break;
+                       ret = filemap_fdatawait_range(inode->i_mapping,
+                                                     lockstart,
+                                                     lockend);
                        if (ret)
                                break;
 
@@ -7112,8 +7225,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                                       block_start, len,
                                                       orig_block_len,
                                                       ram_bytes, type);
-                               if (IS_ERR(em))
+                               if (IS_ERR(em)) {
+                                       ret = PTR_ERR(em);
                                        goto unlock_err;
+                               }
                        }
 
                        ret = btrfs_add_ordered_extent_dio(inode, start,
@@ -7188,45 +7303,277 @@ unlock_err:
        return ret;
 }
 
-static void btrfs_endio_direct_read(struct bio *bio, int err)
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+                                       int rw, int mirror_num)
 {
-       struct btrfs_dio_private *dip = bio->bi_private;
-       struct bio_vec *bvec;
-       struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct bio *dio_bio;
-       u32 *csums = (u32 *)dip->csum;
+       int ret;
+
+       BUG_ON(rw & REQ_WRITE);
+
+       bio_get(bio);
+
+       ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+                                 BTRFS_WQ_ENDIO_DIO_REPAIR);
+       if (ret)
+               goto err;
+
+       ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+       bio_put(bio);
+       return ret;
+}
+
+/*
+ * Decide whether a failed direct IO read is worth retrying on another mirror.
+ *
+ * Stores @failed_mirror in @failrec and bumps failrec->this_mirror to the
+ * next candidate, stepping over @failed_mirror itself.  @failed_bio is not
+ * consulted here.
+ *
+ * Returns 1 when failrec->this_mirror is still within the number of copies
+ * reported by btrfs_num_copies(), 0 when there is a single copy or
+ * this_mirror has moved past the last copy.
+ */
+static int btrfs_check_dio_repairable(struct inode *inode,
+                                     struct bio *failed_bio,
+                                     struct io_failure_record *failrec,
+                                     int failed_mirror)
+{
+       int copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+                                     failrec->logical, failrec->len);
+
+       if (copies == 1) {
+               /*
+                * With only one copy of the data there is nothing to retry
+                * against; whatever went wrong is very likely to persist, so
+                * skip the retry and error correction machinery entirely.
+                */
+               pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+                        copies, failrec->this_mirror, failed_mirror);
+               return 0;
+       }
+
+       failrec->failed_mirror = failed_mirror;
+
+       /* advance to the next mirror, never re-selecting the one that failed */
+       failrec->this_mirror++;
+       if (failrec->this_mirror == failed_mirror)
+               failrec->this_mirror++;
+
+       if (failrec->this_mirror <= copies)
+               return 1;
+
+       pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+                copies, failrec->this_mirror, failed_mirror);
+       return 0;
+}
+
+/*
+ * Build and submit a repair read for the [@start, @end] range of a failed
+ * direct IO read bio.
+ *
+ * @failed_bio must be a read (BUG otherwise).  @page receives the re-read
+ * data; @repair_endio / @repair_arg are handed to btrfs_create_repair_bio()
+ * for the new bio.
+ *
+ * Returns 0 once the repair bio has been submitted.  Otherwise returns the
+ * error from btrfs_get_io_failure_record() or submit_dio_repair_bio(), or
+ * -EIO when btrfs_check_dio_repairable() rejects the retry or the repair bio
+ * cannot be created.  On every failure after the failure record was obtained
+ * the record is released with free_io_failure().
+ */
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+                         struct page *page, u64 start, u64 end,
+                         int failed_mirror, bio_end_io_t *repair_endio,
+                         void *repair_arg)
+{
+       struct io_failure_record *failrec;
+       struct bio *repair_bio;
+       int read_mode = READ_SYNC;
+       int isector;
+       int ret;
+
+       BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+       ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+       if (ret)
+               return ret;
+
+       if (!btrfs_check_dio_repairable(inode, failed_bio, failrec,
+                                       failed_mirror)) {
+               ret = -EIO;
+               goto free_failrec;
+       }
+
+       /* a multi-vector bio additionally gets REQ_FAILFAST_DEV */
+       if (failed_bio->bi_vcnt > 1)
+               read_mode |= REQ_FAILFAST_DEV;
+
+       /* byte distance from the start of the failed bio, in blocks */
+       isector = start - btrfs_io_bio(failed_bio)->logical;
+       isector >>= inode->i_sb->s_blocksize_bits;
+
+       repair_bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+                                            0, isector, repair_endio,
+                                            repair_arg);
+       if (!repair_bio) {
+               ret = -EIO;
+               goto free_failrec;
+       }
+
+       btrfs_debug(BTRFS_I(inode)->root->fs_info,
+                   "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+                   read_mode, failrec->this_mirror, failrec->in_validation);
+
+       ret = submit_dio_repair_bio(inode, repair_bio, read_mode,
+                                   failrec->this_mirror);
+       if (!ret)
+               return 0;
+
+       /* submission failed: drop the failure record, then the repair bio */
+       free_io_failure(inode, failrec);
+       bio_put(repair_bio);
+       return ret;
+
+free_failrec:
+       free_io_failure(inode, failrec);
+       return ret;
+}
+
+struct btrfs_retry_complete {
+       struct completion done;
+       struct inode *inode;
        u64 start;
+       int uptodate;
+};
+
+/*
+ * End-io handler for repair reads issued by __btrfs_correct_data_nocsum().
+ *
+ * bio->bi_private is the struct btrfs_retry_complete of the waiter.  When
+ * @err is zero the retry is flagged uptodate and clean_io_failure() is called
+ * for each page in the bio.  In every case the waiter is completed and the
+ * bio reference is dropped.
+ */
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+       struct btrfs_retry_complete *retry = bio->bi_private;
+       struct bio_vec *bvec;
+       int i;
+
+       if (!err) {
+               retry->uptodate = 1;
+               bio_for_each_segment_all(bvec, bio, i) {
+                       clean_io_failure(retry->inode, retry->start,
+                                        bvec->bv_page, 0);
+               }
+       }
+
+       complete(&retry->done);
+       bio_put(bio);
+}
+
+/*
+ * Re-read every segment of a failed direct IO read bio.
+ *
+ * For each segment a repair read is submitted through dio_read_error() and
+ * waited for; this repeats until a repair read completes with uptodate set
+ * by btrfs_retry_endio_nocsum().
+ *
+ * Returns 0 when every segment was re-read, or the first error returned by
+ * dio_read_error().
+ */
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+                                      struct btrfs_io_bio *io_bio)
+{
+       struct btrfs_retry_complete retry;
+       struct bio_vec *bvec;
+       u64 offset = io_bio->logical;
+       int ret;
+       int i;
+
+       retry.inode = inode;
+
+       bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+               /* We might have another mirror, so keep trying */
+               do {
+                       retry.uptodate = 0;
+                       retry.start = offset;
+                       init_completion(&retry.done);
+
+                       ret = dio_read_error(inode, &io_bio->bio,
+                                            bvec->bv_page, offset,
+                                            offset + bvec->bv_len - 1,
+                                            io_bio->mirror_num,
+                                            btrfs_retry_endio_nocsum, &retry);
+                       if (ret)
+                               return ret;
+
+                       wait_for_completion(&retry.done);
+               } while (!retry.uptodate);
+
+               offset += bvec->bv_len;
+       }
+
+       return 0;
+}
+
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+       struct btrfs_retry_complete *done = bio->bi_private;
+       struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+       struct bio_vec *bvec;
+       int uptodate;
+       int ret;
        int i;
 
-       start = dip->logical_offset;
+       if (err)
+               goto end;
+
+       uptodate = 1;
        bio_for_each_segment_all(bvec, bio, i) {
-               if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-                       struct page *page = bvec->bv_page;
-                       char *kaddr;
-                       u32 csum = ~(u32)0;
-                       unsigned long flags;
-
-                       local_irq_save(flags);
-                       kaddr = kmap_atomic(page);
-                       csum = btrfs_csum_data(kaddr + bvec->bv_offset,
-                                              csum, bvec->bv_len);
-                       btrfs_csum_final(csum, (char *)&csum);
-                       kunmap_atomic(kaddr);
-                       local_irq_restore(flags);
-
-                       flush_dcache_page(bvec->bv_page);
-                       if (csum != csums[i]) {
-                               btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
-                                         btrfs_ino(inode), start, csum,
-                                         csums[i]);
-                               err = -EIO;
-                       }
+               ret = __readpage_endio_check(done->inode, io_bio, i,
+                                            bvec->bv_page, 0,
+                                            done->start, bvec->bv_len);
+               if (!ret)
+                       clean_io_failure(done->inode, done->start,
+                                        bvec->bv_page, 0);
+               else
+                       uptodate = 0;
+       }
+
+       done->uptodate = uptodate;
+end:
+       complete(&done->done);
+       bio_put(bio);
+}
+
+static int __btrfs_subio_endio_read(struct inode *inode,
+                                   struct btrfs_io_bio *io_bio, int err)
+{
+       struct bio_vec *bvec;
+       struct btrfs_retry_complete done;
+       u64 start;
+       u64 offset = 0;
+       int i;
+       int ret;
+
+       err = 0;
+       start = io_bio->logical;
+       done.inode = inode;
+
+       bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+               ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
+                                            0, start, bvec->bv_len);
+               if (likely(!ret))
+                       goto next;
+try_again:
+               done.uptodate = 0;
+               done.start = start;
+               init_completion(&done.done);
+
+               ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+                                    start + bvec->bv_len - 1,
+                                    io_bio->mirror_num,
+                                    btrfs_retry_endio, &done);
+               if (ret) {
+                       err = ret;
+                       goto next;
                }
 
+               wait_for_completion(&done.done);
+
+               if (!done.uptodate) {
+                       /* We might have another mirror, so try again */
+                       goto try_again;
+               }
+next:
+               offset += bvec->bv_len;
                start += bvec->bv_len;
        }
 
+       return err;
+}
+
+/*
+ * Read completion hook for a direct IO sub-bio.
+ *
+ * Inodes without BTRFS_INODE_NODATASUM go through
+ * __btrfs_subio_endio_read().  For NODATASUM inodes, repair is attempted with
+ * __btrfs_correct_data_nocsum() only when @err is set; otherwise 0 is
+ * returned.
+ */
+static int btrfs_subio_endio_read(struct inode *inode,
+                                 struct btrfs_io_bio *io_bio, int err)
+{
+       if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+               return __btrfs_subio_endio_read(inode, io_bio, err);
+
+       if (unlikely(err))
+               return __btrfs_correct_data_nocsum(inode, io_bio);
+
+       return 0;
+}
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+       struct btrfs_dio_private *dip = bio->bi_private;
+       struct inode *inode = dip->inode;
+       struct bio *dio_bio;
+       struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+
+       if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+               err = btrfs_subio_endio_read(inode, io_bio, err);
+
        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
                      dip->logical_offset + dip->bytes - 1);
        dio_bio = dip->dio_bio;
@@ -7237,6 +7584,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
        if (err)
                clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
        dio_end_io(dio_bio, err);
+
+       if (io_bio->end_io)
+               io_bio->end_io(io_bio, err);
        bio_put(bio);
 }
 
@@ -7302,12 +7652,17 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
 {
        struct btrfs_dio_private *dip = bio->bi_private;
 
+       if (err)
+               btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+                          "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
+                          btrfs_ino(dip->inode), bio->bi_rw,
+                          (unsigned long long)bio->bi_iter.bi_sector,
+                          bio->bi_iter.bi_size, err);
+
+       if (dip->subio_endio)
+               err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
+
        if (err) {
-               btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
-                         "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
-                     btrfs_ino(dip->inode), bio->bi_rw,
-                     (unsigned long long)bio->bi_iter.bi_sector,
-                     bio->bi_iter.bi_size, err);
                dip->errors = 1;
 
                /*
@@ -7338,6 +7693,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
        return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
 }
 
+/*
+ * Make checksum data available to a direct IO read bio.
+ *
+ * When @file_offset is the start of the whole dio (dip->logical_offset), the
+ * checksums are looked up once on dip->orig_bio.  A bio other than
+ * dip->orig_bio then has its csum pointer aimed at its own slot inside the
+ * original bio's csum buffer, indexed by block and treating entries as u32.
+ *
+ * Returns 0 on success or the error from btrfs_lookup_bio_sums_dio().
+ */
+static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
+                                                struct inode *inode,
+                                                struct btrfs_dio_private *dip,
+                                                struct bio *bio,
+                                                u64 file_offset)
+{
+       u32 *orig_csums;
+       u64 csum_index;
+       int ret;
+
+       /*
+        * All the csum data is loaded when the first bio is submitted, which
+        * cuts down on csum tree searches and contention.
+        */
+       if (dip->logical_offset == file_offset) {
+               ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
+                                               file_offset);
+               if (ret)
+                       return ret;
+       }
+
+       /* the original bio already owns the csum buffer */
+       if (bio == dip->orig_bio)
+               return 0;
+
+       /* block index of this bio relative to the start of the dio */
+       csum_index = (file_offset - dip->logical_offset) >>
+                    inode->i_sb->s_blocksize_bits;
+       orig_csums = (u32 *)btrfs_io_bio(dip->orig_bio)->csum;
+       btrfs_io_bio(bio)->csum = (u8 *)(orig_csums + csum_index);
+
+       return 0;
+}
+
 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                                         int rw, u64 file_offset, int skip_sum,
                                         int async_submit)
@@ -7353,7 +7740,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
        bio_get(bio);
 
        if (!write) {
-               ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+               ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+                               BTRFS_WQ_ENDIO_DATA);
                if (ret)
                        goto err;
        }
@@ -7376,13 +7764,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
                if (ret)
                        goto err;
-       } else if (!skip_sum) {
-               ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
-                                               file_offset);
+       } else {
+               ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
+                                                    file_offset);
                if (ret)
                        goto err;
        }
-
 map:
        ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
 err:
@@ -7403,7 +7790,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
        u64 submit_len = 0;
        u64 map_length;
        int nr_pages = 0;
-       int ret = 0;
+       int ret;
        int async_submit = 0;
 
        map_length = orig_bio->bi_iter.bi_size;
@@ -7414,6 +7801,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 
        if (map_length >= orig_bio->bi_iter.bi_size) {
                bio = orig_bio;
+               dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
                goto submit;
        }
 
@@ -7430,12 +7818,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 
        bio->bi_private = dip;
        bio->bi_end_io = btrfs_end_dio_bio;
+       btrfs_io_bio(bio)->logical = file_offset;
        atomic_inc(&dip->pending_bios);
 
        while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
-               if (unlikely(map_length < submit_len + bvec->bv_len ||
+               if (map_length < submit_len + bvec->bv_len ||
                    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
-                                bvec->bv_offset) < bvec->bv_len)) {
+                                bvec->bv_offset) < bvec->bv_len) {
                        /*
                         * inc the count before we submit the bio so
                         * we know the end IO handler won't happen before
@@ -7464,6 +7853,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
                                goto out_err;
                        bio->bi_private = dip;
                        bio->bi_end_io = btrfs_end_dio_bio;
+                       btrfs_io_bio(bio)->logical = file_offset;
 
                        map_length = orig_bio->bi_iter.bi_size;
                        ret = btrfs_map_block(root->fs_info, rw,
@@ -7507,11 +7897,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_dio_private *dip;
        struct bio *io_bio;
+       struct btrfs_io_bio *btrfs_bio;
        int skip_sum;
-       int sum_len;
        int write = rw & REQ_WRITE;
        int ret = 0;
-       u16 csum_size;
 
        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
@@ -7521,16 +7910,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
                goto free_ordered;
        }
 
-       if (!skip_sum && !write) {
-               csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
-               sum_len = dio_bio->bi_iter.bi_size >>
-                       inode->i_sb->s_blocksize_bits;
-               sum_len *= csum_size;
-       } else {
-               sum_len = 0;
-       }
-
-       dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
+       dip = kzalloc(sizeof(*dip), GFP_NOFS);
        if (!dip) {
                ret = -ENOMEM;
                goto free_io_bio;
@@ -7542,20 +7922,25 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
        dip->bytes = dio_bio->bi_iter.bi_size;
        dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
        io_bio->bi_private = dip;
-       dip->errors = 0;
        dip->orig_bio = io_bio;
        dip->dio_bio = dio_bio;
        atomic_set(&dip->pending_bios, 0);
+       btrfs_bio = btrfs_io_bio(io_bio);
+       btrfs_bio->logical = file_offset;
 
-       if (write)
+       if (write) {
                io_bio->bi_end_io = btrfs_endio_direct_write;
-       else
+       } else {
                io_bio->bi_end_io = btrfs_endio_direct_read;
+               dip->subio_endio = btrfs_subio_endio_read;
+       }
 
        ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
        if (!ret)
                return;
 
+       if (btrfs_bio->end_io)
+               btrfs_bio->end_io(btrfs_bio, ret);
 free_io_bio:
        bio_put(io_bio);
 
@@ -7652,8 +8037,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                ret = btrfs_delalloc_reserve_space(inode, count);
                if (ret)
                        goto out;
-       } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
-                                    &BTRFS_I(inode)->runtime_flags))) {
+       } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+                                    &BTRFS_I(inode)->runtime_flags)) {
                inode_dio_done(inode);
                flags = DIO_LOCKING | DIO_SKIP_HOLES;
                wakeup = false;
@@ -8173,6 +8558,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->last_sub_trans = 0;
        ei->logged_trans = 0;
        ei->delalloc_bytes = 0;
+       ei->defrag_bytes = 0;
        ei->disk_i_size = 0;
        ei->flags = 0;
        ei->csum_bytes = 0;
@@ -8231,6 +8617,7 @@ void btrfs_destroy_inode(struct inode *inode)
        WARN_ON(BTRFS_I(inode)->reserved_extents);
        WARN_ON(BTRFS_I(inode)->delalloc_bytes);
        WARN_ON(BTRFS_I(inode)->csum_bytes);
+       WARN_ON(BTRFS_I(inode)->defrag_bytes);
 
        /*
         * This can happen where we create an inode, but somebody else also
@@ -8646,7 +9033,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
                spin_unlock(&root->delalloc_lock);
 
                work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
-               if (unlikely(!work)) {
+               if (!work) {
                        if (delay_iput)
                                btrfs_add_delayed_iput(inode);
                        else
@@ -8832,7 +9219,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        }
        key.objectid = btrfs_ino(inode);
        key.offset = 0;
-       btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+       key.type = BTRFS_EXTENT_DATA_KEY;
        datasize = btrfs_file_extent_calc_inline_size(name_len);
        err = btrfs_insert_empty_item(trans, root, path, &key,
                                      datasize);
@@ -9110,6 +9497,21 @@ out_inode:
 
 }
 
+/* Inspired by filemap_check_errors() */
+/*
+ * Report and clear pending AS_ENOSPC / AS_EIO flags on the inode's mapping.
+ *
+ * Returns -EIO if AS_EIO was set, else -ENOSPC if AS_ENOSPC was set, else 0.
+ * Both flags are tested and cleared, so -EIO takes precedence when both
+ * were pending.
+ */
+int btrfs_inode_check_errors(struct inode *inode)
+{
+       struct address_space *mapping = inode->i_mapping;
+       int ret = 0;
+
+       /* cheap test_bit first; only do the atomic clear when the bit is set */
+       if (test_bit(AS_ENOSPC, &mapping->flags) &&
+           test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+               ret = -ENOSPC;
+
+       if (test_bit(AS_EIO, &mapping->flags) &&
+           test_and_clear_bit(AS_EIO, &mapping->flags))
+               ret = -EIO;
+
+       return ret;
+}
+
 static const struct inode_operations btrfs_dir_inode_operations = {
        .getattr        = btrfs_getattr,
        .lookup         = btrfs_lookup,