Merge tag 'iommu-config-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...

[cascardo/linux.git] / fs / btrfs / inode.c
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index 016c403..e687bb0 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
  
                 key.objectid = btrfs_ino(inode);
                 key.offset = start;
-               btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+               key.type = BTRFS_EXTENT_DATA_KEY;
  
                 datasize = btrfs_file_extent_calc_inline_size(cur_size);
                 path->leave_spinning = 1;
@@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
                 data_len = compressed_size;
  
         if (start > 0 ||
-           actual_end >= PAGE_CACHE_SIZE ||
-           data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+           actual_end > PAGE_CACHE_SIZE ||
+           data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
             (!compressed_size &&
             (actual_end & (root->sectorsize - 1)) == 0) ||
             end + 1 < isize ||
@@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow,
         return 0;
  }
  
+static inline int inode_need_compress(struct inode *inode)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+
+       /* the FORCE_COMPRESS option overrides the inode's NOCOMPRESS flag */
+       if (btrfs_test_opt(root, FORCE_COMPRESS))
+               return 1;
+       /* inode flagged NOCOMPRESS after bad compression ratios */
+       if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+               return 0;
+       if (btrfs_test_opt(root, COMPRESS) ||
+           BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
+           BTRFS_I(inode)->force_compress)
+               return 1;
+       return 0;
+}
+
  /*
   * we create compressed extents in two phases.  The first
   * phase compresses a range of pages that have already been
@@ -365,7 +382,7 @@ static noinline int add_async_extent(struct async_cow *cow,
   * are written in the same order that the flusher thread sent them
   * down.
   */
-static noinline int compress_file_range(struct inode *inode,
+static noinline void compress_file_range(struct inode *inode,
                                         struct page *locked_page,
                                         u64 start, u64 end,
                                         struct async_cow *async_cow,
@@ -394,14 +411,6 @@ static noinline int compress_file_range(struct inode *inode,
             (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                 btrfs_add_inode_defrag(NULL, inode);
  
-       /*
-        * skip compression for a small file range(<=blocksize) that
-        * isn't an inline extent, since it dosen't save disk space at all.
-        */
-       if ((end - start + 1) <= blocksize &&
-           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
-               goto cleanup_and_bail_uncompressed;
-
         actual_end = min_t(u64, isize, end + 1);
  again:
         will_compress = 0;
@@ -423,6 +432,14 @@ again:
  
         total_compressed = actual_end - start;
  
+       /*
+        * skip compression for a small file range (<= blocksize) that
+        * isn't an inline extent, since it doesn't save disk space at all.
+        */
+       if (total_compressed <= blocksize &&
+          (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+               goto cleanup_and_bail_uncompressed;
+
         /* we want to make sure that amount of ram required to uncompress
          * an extent is reasonable, so we limit the total size in ram
          * of a compressed extent to 128k.  This is a crucial number
@@ -444,10 +461,7 @@ again:
          * inode has not been flagged as nocompress.  This flag can
          * change at any time if we discover bad compression ratios.
          */
-       if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
-           (btrfs_test_opt(root, COMPRESS) ||
-            (BTRFS_I(inode)->force_compress) ||
-            (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
+       if (inode_need_compress(inode)) {
                 WARN_ON(pages);
                 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
                 if (!pages) {
@@ -513,7 +527,10 @@ cont:
                 if (ret <= 0) {
                         unsigned long clear_flags = EXTENT_DELALLOC |
                                 EXTENT_DEFRAG;
+                       unsigned long page_error_op;
+
                         clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
+                       page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
  
                         /*
                          * inline extent creation worked or returned error,
@@ -524,6 +541,7 @@ cont:
                                                      clear_flags, PAGE_UNLOCK |
                                                      PAGE_CLEAR_DIRTY |
                                                      PAGE_SET_WRITEBACK |
+                                                    page_error_op |
                                                      PAGE_END_WRITEBACK);
                         goto free_pages_out;
                 }
@@ -606,8 +624,7 @@ cleanup_and_bail_uncompressed:
                 *num_added += 1;
         }
  
-out:
-       return ret;
+       return;
  
  free_pages_out:
         for (i = 0; i < nr_pages_ret; i++) {
@@ -615,8 +632,22 @@ free_pages_out:
                 page_cache_release(pages[i]);
         }
         kfree(pages);
+}
  
-       goto out;
+static void free_async_extent_pages(struct async_extent *async_extent)
+{
+       int i;
+
+       if (!async_extent->pages)
+               return;
+
+       for (i = 0; i < async_extent->nr_pages; i++) {
+               WARN_ON(async_extent->pages[i]->mapping);
+               page_cache_release(async_extent->pages[i]);
+       }
+       kfree(async_extent->pages);
+       async_extent->nr_pages = 0;
+       async_extent->pages = NULL;
  }
  
  /*
@@ -625,7 +656,7 @@ free_pages_out:
   * queued.  We walk all the async extents created by compress_file_range
   * and send them down to the disk.
   */
-static noinline int submit_compressed_extents(struct inode *inode,
+static noinline void submit_compressed_extents(struct inode *inode,
                                               struct async_cow *async_cow)
  {
         struct async_extent *async_extent;
@@ -637,9 +668,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
         struct extent_io_tree *io_tree;
         int ret = 0;
  
-       if (list_empty(&async_cow->extents))
-               return 0;
-
  again:
         while (!list_empty(&async_cow->extents)) {
                 async_extent = list_entry(async_cow->extents.next,
@@ -695,15 +723,7 @@ retry:
                                            async_extent->compressed_size,
                                            0, alloc_hint, &ins, 1, 1);
                 if (ret) {
-                       int i;
-
-                       for (i = 0; i < async_extent->nr_pages; i++) {
-                               WARN_ON(async_extent->pages[i]->mapping);
-                               page_cache_release(async_extent->pages[i]);
-                       }
-                       kfree(async_extent->pages);
-                       async_extent->nr_pages = 0;
-                       async_extent->pages = NULL;
+                       free_async_extent_pages(async_extent);
  
                         if (ret == -ENOSPC) {
                                 unlock_extent(io_tree, async_extent->start,
@@ -800,15 +820,26 @@ retry:
                                     ins.objectid,
                                     ins.offset, async_extent->pages,
                                     async_extent->nr_pages);
+               if (ret) {
+                       struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+                       struct page *p = async_extent->pages[0];
+                       const u64 start = async_extent->start;
+                       const u64 end = start + async_extent->ram_size - 1;
+
+                       p->mapping = inode->i_mapping;
+                       tree->ops->writepage_end_io_hook(p, start, end,
+                                                        NULL, 0);
+                       p->mapping = NULL;
+                       extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+                                                    PAGE_END_WRITEBACK |
+                                                    PAGE_SET_ERROR);
+                       free_async_extent_pages(async_extent);
+               }
                 alloc_hint = ins.objectid + ins.offset;
                 kfree(async_extent);
-               if (ret)
-                       goto out;
                 cond_resched();
         }
-       ret = 0;
-out:
-       return ret;
+       return;
  out_free_reserve:
         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
  out_free:
@@ -818,7 +849,9 @@ out_free:
                                      NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                      EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                      PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
-                                    PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+                                    PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
+                                    PAGE_SET_ERROR);
+       free_async_extent_pages(async_extent);
         kfree(async_extent);
         goto again;
  }
@@ -1094,7 +1127,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
                 async_cow->locked_page = locked_page;
                 async_cow->start = start;
  
-               if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+               if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+                   !btrfs_test_opt(root, FORCE_COMPRESS))
                         cur_end = end;
                 else
                         cur_end = min(end, start + 512 * 1024 - 1);
@@ -1303,7 +1337,7 @@ next_slot:
                          * we fall into common COW way.
                          */
                         if (!nolock) {
-                               err = btrfs_start_nocow_write(root);
+                               err = btrfs_start_write_no_snapshoting(root);
                                 if (!err)
                                         goto out_check;
                         }
@@ -1327,7 +1361,7 @@ out_check:
                 if (extent_end <= start) {
                         path->slots[0]++;
                         if (!nolock && nocow)
-                               btrfs_end_nocow_write(root);
+                               btrfs_end_write_no_snapshoting(root);
                         goto next_slot;
                 }
                 if (!nocow) {
@@ -1347,7 +1381,7 @@ out_check:
                                              page_started, nr_written, 1);
                         if (ret) {
                                 if (!nolock && nocow)
-                                       btrfs_end_nocow_write(root);
+                                       btrfs_end_write_no_snapshoting(root);
                                 goto error;
                         }
                         cow_start = (u64)-1;
@@ -1398,7 +1432,7 @@ out_check:
                                                       num_bytes);
                         if (ret) {
                                 if (!nolock && nocow)
-                                       btrfs_end_nocow_write(root);
+                                       btrfs_end_write_no_snapshoting(root);
                                 goto error;
                         }
                 }
@@ -1409,7 +1443,7 @@ out_check:
                                              EXTENT_DELALLOC, PAGE_UNLOCK |
                                              PAGE_SET_PRIVATE2);
                 if (!nolock && nocow)
-                       btrfs_end_nocow_write(root);
+                       btrfs_end_write_no_snapshoting(root);
                 cur_offset = extent_end;
                 if (cur_offset > end)
                         break;
@@ -1445,6 +1479,26 @@ error:
         return ret;
  }
  
+static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
+{
+
+       if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+           !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
+               return 0;
+
+       /*
+        * @defrag_bytes is only a hint here since no spinlock is held;
+        * if it is not zero, it means the file is being defragged.
+        * Force cow if the given extent needs to be defragged.
+        */
+       if (BTRFS_I(inode)->defrag_bytes &&
+           test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+                          EXTENT_DEFRAG, 0, NULL))
+               return 1;
+
+       return 0;
+}
+
  /*
   * extent_io.c call back to do delayed allocation processing
   */
@@ -1453,17 +1507,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
                               unsigned long *nr_written)
  {
         int ret;
-       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int force_cow = need_force_cow(inode, start, end);
  
-       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
+       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
                 ret = run_delalloc_nocow(inode, locked_page, start, end,
                                          page_started, 1, nr_written);
-       } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
+       } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
                 ret = run_delalloc_nocow(inode, locked_page, start, end,
                                          page_started, 0, nr_written);
-       } else if (!btrfs_test_opt(root, COMPRESS) &&
-                  !(BTRFS_I(inode)->force_compress) &&
-                  !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
+       } else if (!inode_need_compress(inode)) {
                 ret = cow_file_range(inode, locked_page, start, end,
                                       page_started, nr_written, 1);
         } else {
@@ -1555,6 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
                                struct extent_state *state, unsigned long *bits)
  {
  
+       if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+               WARN_ON(1);
         /*
          * set_bit and clear bit hooks normally require _irqsave/restore
          * but in this case, we are only testing for the DELALLOC
@@ -1577,6 +1631,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
                                      root->fs_info->delalloc_batch);
                 spin_lock(&BTRFS_I(inode)->lock);
                 BTRFS_I(inode)->delalloc_bytes += len;
+               if (*bits & EXTENT_DEFRAG)
+                       BTRFS_I(inode)->defrag_bytes += len;
                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
                                          &BTRFS_I(inode)->runtime_flags))
                         btrfs_add_delalloc_inodes(root, inode);
@@ -1591,6 +1647,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                                  struct extent_state *state,
                                  unsigned long *bits)
  {
+       u64 len = state->end + 1 - state->start;
+
+       spin_lock(&BTRFS_I(inode)->lock);
+       if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
+               BTRFS_I(inode)->defrag_bytes -= len;
+       spin_unlock(&BTRFS_I(inode)->lock);
+
         /*
          * set_bit and clear bit hooks normally require _irqsave/restore
          * but in this case, we are only testing for the DELALLOC
@@ -1598,7 +1661,6 @@ static void btrfs_clear_bit_hook(struct inode *inode,
          */
         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                 struct btrfs_root *root = BTRFS_I(inode)->root;
-               u64 len = state->end + 1 - state->start;
                 bool do_list = !btrfs_is_free_space_inode(inode);
  
                 if (*bits & EXTENT_FIRST_DELALLOC) {
@@ -2660,6 +2722,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                 goto out;
         }
  
+       btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
+                                    ordered_extent->file_offset +
+                                    ordered_extent->len - 1);
+
         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
                 truncated = true;
                 logical_len = ordered_extent->truncated_len;
@@ -2856,6 +2922,40 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
         return 0;
  }
  
+static int __readpage_endio_check(struct inode *inode,
+                                 struct btrfs_io_bio *io_bio,
+                                 int icsum, struct page *page,
+                                 int pgoff, u64 start, size_t len)
+{
+       char *kaddr;
+       u32 csum_expected;
+       u32 csum = ~(u32)0;
+       static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                     DEFAULT_RATELIMIT_BURST);
+
+       csum_expected = *(((u32 *)io_bio->csum) + icsum);
+
+       kaddr = kmap_atomic(page);
+       csum = btrfs_csum_data(kaddr + pgoff, csum,  len);
+       btrfs_csum_final(csum, (char *)&csum);
+       if (csum != csum_expected)
+               goto zeroit;
+
+       kunmap_atomic(kaddr);
+       return 0;
+zeroit:
+       if (__ratelimit(&_rs))
+               btrfs_info(BTRFS_I(inode)->root->fs_info,
+                          "csum failed ino %llu off %llu csum %u expected csum %u",
+                          btrfs_ino(inode), start, csum, csum_expected);
+       memset(kaddr + pgoff, 1, len);
+       flush_dcache_page(page);
+       kunmap_atomic(kaddr);
+       if (csum_expected == 0)
+               return 0;
+       return -EIO;
+}
+
  /*
   * when reads are done, we need to check csums to verify the data is correct
   * if there's a match, we allow the bio to finish.  If not, the code in
@@ -2868,20 +2968,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
         size_t offset = start - page_offset(page);
         struct inode *inode = page->mapping->host;
         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       char *kaddr;
         struct btrfs_root *root = BTRFS_I(inode)->root;
-       u32 csum_expected;
-       u32 csum = ~(u32)0;
-       static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
-                                     DEFAULT_RATELIMIT_BURST);
  
         if (PageChecked(page)) {
                 ClearPageChecked(page);
-               goto good;
+               return 0;
         }
  
         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
-               goto good;
+               return 0;
  
         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -2891,28 +2986,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
         }
  
         phy_offset >>= inode->i_sb->s_blocksize_bits;
-       csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
-
-       kaddr = kmap_atomic(page);
-       csum = btrfs_csum_data(kaddr + offset, csum,  end - start + 1);
-       btrfs_csum_final(csum, (char *)&csum);
-       if (csum != csum_expected)
-               goto zeroit;
-
-       kunmap_atomic(kaddr);
-good:
-       return 0;
-
-zeroit:
-       if (__ratelimit(&_rs))
-               btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
-                       btrfs_ino(page->mapping->host), start, csum, csum_expected);
-       memset(kaddr + offset, 1, end - start + 1);
-       flush_dcache_page(page);
-       kunmap_atomic(kaddr);
-       if (csum_expected == 0)
-               return 0;
-       return -EIO;
+       return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
+                                     start, (size_t)(end - start + 1));
  }
  
  struct delayed_iput {
@@ -3159,7 +3234,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
         path->reada = -1;
  
         key.objectid = BTRFS_ORPHAN_OBJECTID;
-       btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+       key.type = BTRFS_ORPHAN_ITEM_KEY;
         key.offset = (u64)-1;
  
         while (1) {
@@ -3186,7 +3261,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                 /* make sure the item matches what we want */
                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
                         break;
-               if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+               if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
                         break;
  
                 /* release the path since we're done with it */
@@ -3662,7 +3737,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
          * without delay
          */
         if (!btrfs_is_free_space_inode(inode)
-           && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+           && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+           && !root->fs_info->log_root_recovering) {
                 btrfs_update_root_times(trans, root);
  
                 ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -4085,7 +4161,7 @@ search_again:
                 fi = NULL;
                 leaf = path->nodes[0];
                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-               found_type = btrfs_key_type(&found_key);
+               found_type = found_key.type;
  
                 if (found_key.objectid != ino)
                         break;
@@ -4523,6 +4599,26 @@ next:
         return err;
  }
  
+static int wait_snapshoting_atomic_t(atomic_t *a)
+{
+       schedule();
+       return 0;
+}
+
+static void wait_for_snapshot_creation(struct btrfs_root *root)
+{
+       while (true) {
+               int ret;
+
+               ret = btrfs_start_write_no_snapshoting(root);
+               if (ret)
+                       break;
+               wait_on_atomic_t(&root->will_be_snapshoted,
+                                wait_snapshoting_atomic_t,
+                                TASK_UNINTERRUPTIBLE);
+       }
+}
+
  static int btrfs_setsize(struct inode *inode, struct iattr *attr)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4547,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
  
         if (newsize > oldsize) {
                 truncate_pagecache(inode, newsize);
+               /*
+                * Don't do an expanding truncate while snapshotting is ongoing.
+                * This is to ensure the snapshot captures a fully consistent
+                * state of this file - if the snapshot captures this expanding
+                * truncation, it must capture all writes that happened before
+                * this truncation.
+                */
+               wait_for_snapshot_creation(root);
                 ret = btrfs_cont_expand(inode, oldsize, newsize);
-               if (ret)
+               if (ret) {
+                       btrfs_end_write_no_snapshoting(root);
                         return ret;
+               }
  
                 trans = btrfs_start_transaction(root, 1);
-               if (IS_ERR(trans))
+               if (IS_ERR(trans)) {
+                       btrfs_end_write_no_snapshoting(root);
                         return PTR_ERR(trans);
+               }
  
                 i_size_write(inode, newsize);
                 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
                 ret = btrfs_update_inode(trans, root, inode);
+               btrfs_end_write_no_snapshoting(root);
                 btrfs_end_transaction(trans, root);
         } else {
  
@@ -4747,6 +4856,8 @@ void btrfs_evict_inode(struct inode *inode)
         /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
         btrfs_wait_ordered_range(inode, 0, (u64)-1);
  
+       btrfs_free_io_failure_record(inode, 0, (u64)-1);
+
         if (root->fs_info->log_root_recovering) {
                 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
                                  &BTRFS_I(inode)->runtime_flags));
@@ -5202,42 +5313,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
                         iput(inode);
                         inode = ERR_PTR(ret);
                 }
-               /*
-                * If orphan cleanup did remove any orphans, it means the tree
-                * was modified and therefore the commit root is not the same as
-                * the current root anymore. This is a problem, because send
-                * uses the commit root and therefore can see inode items that
-                * don't exist in the current root anymore, and for example make
-                * calls to btrfs_iget, which will do tree lookups based on the
-                * current root and not on the commit root. Those lookups will
-                * fail, returning a -ESTALE error, and making send fail with
-                * that error. So make sure a send does not see any orphans we
-                * have just removed, and that it will see the same inodes
-                * regardless of whether a transaction commit happened before
-                * it started (meaning that the commit root will be the same as
-                * the current root) or not.
-                */
-               if (sub_root->node != sub_root->commit_root) {
-                       u64 sub_flags = btrfs_root_flags(&sub_root->root_item);
-
-                       if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) {
-                               struct extent_buffer *eb;
-
-                               /*
-                                * Assert we can't have races between dentry
-                                * lookup called through the snapshot creation
-                                * ioctl and the VFS.
-                                */
-                               ASSERT(mutex_is_locked(&dir->i_mutex));
-
-                               down_write(&root->fs_info->commit_root_sem);
-                               eb = sub_root->commit_root;
-                               sub_root->commit_root =
-                                       btrfs_root_node(sub_root);
-                               up_write(&root->fs_info->commit_root_sem);
-                               free_extent_buffer(eb);
-                       }
-               }
         }
  
         return inode;
@@ -5280,7 +5355,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
                         return ERR_CAST(inode);
         }
  
-       return d_materialise_unique(dentry, inode);
+       return d_splice_alias(inode, dentry);
  }
  
  unsigned char btrfs_filetype_table[] = {
@@ -5331,7 +5406,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
                 btrfs_get_delayed_items(inode, &ins_list, &del_list);
         }
  
-       btrfs_set_key_type(&key, key_type);
+       key.type = key_type;
         key.offset = ctx->pos;
         key.objectid = btrfs_ino(inode);
  
@@ -5356,7 +5431,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
  
                 if (found_key.objectid != key.objectid)
                         break;
-               if (btrfs_key_type(&found_key) != key_type)
+               if (found_key.type != key_type)
                         break;
                 if (found_key.offset < ctx->pos)
                         goto next;
@@ -5568,7 +5643,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
         int ret;
  
         key.objectid = btrfs_ino(inode);
-       btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+       key.type = BTRFS_DIR_INDEX_KEY;
         key.offset = (u64)-1;
  
         path = btrfs_alloc_path();
@@ -5600,7 +5675,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
  
         if (found_key.objectid != btrfs_ino(inode) ||
-           btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
+           found_key.type != BTRFS_DIR_INDEX_KEY) {
                 BTRFS_I(inode)->index_cnt = 2;
                 goto out;
         }
@@ -5718,7 +5793,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
  
         key[0].objectid = objectid;
-       btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
+       key[0].type = BTRFS_INODE_ITEM_KEY;
         key[0].offset = 0;
  
         sizes[0] = sizeof(struct btrfs_inode_item);
@@ -5731,7 +5806,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                  * add more hard links than can fit in the ref item.
                  */
                 key[1].objectid = objectid;
-               btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+               key[1].type = BTRFS_INODE_REF_KEY;
                 key[1].offset = ref_objectid;
  
                 sizes[1] = name_len + sizeof(*ref);
@@ -5740,7 +5815,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
         location = &BTRFS_I(inode)->location;
         location->objectid = objectid;
         location->offset = 0;
-       btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+       location->type = BTRFS_INODE_ITEM_KEY;
  
         ret = btrfs_insert_inode_locked(inode);
         if (ret < 0)
@@ -5832,7 +5907,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
                 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
         } else {
                 key.objectid = ino;
-               btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+               key.type = BTRFS_INODE_ITEM_KEY;
                 key.offset = 0;
         }
  
@@ -6191,21 +6266,60 @@ out_fail_inode:
         goto out_fail;
  }
  
+/* Find next extent map of a given extent map, caller needs to ensure locks */
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+       struct rb_node *next;
+
+       next = rb_next(&em->rb_node);
+       if (!next)
+               return NULL;
+       return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+       struct rb_node *prev;
+
+       prev = rb_prev(&em->rb_node);
+       if (!prev)
+               return NULL;
+       return container_of(prev, struct extent_map, rb_node);
+}
+
  /* helper for btfs_get_extent.  Given an existing extent in the tree,
+ * (the existing extent being the nearest extent to map_start),
   * and an extent that you want to insert, deal with overlap and insert
- * the new extent into the tree.
+ * the best-fitting new extent into the tree.
   */
  static int merge_extent_mapping(struct extent_map_tree *em_tree,
                                 struct extent_map *existing,
                                 struct extent_map *em,
                                 u64 map_start)
  {
+       struct extent_map *prev;
+       struct extent_map *next;
+       u64 start;
+       u64 end;
         u64 start_diff;
  
         BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
-       start_diff = map_start - em->start;
-       em->start = map_start;
-       em->len = existing->start - em->start;
+
+       if (existing->start > map_start) {
+               next = existing;
+               prev = prev_extent_map(next);
+       } else {
+               prev = existing;
+               next = next_extent_map(prev);
+       }
+
+       start = prev ? extent_map_end(prev) : em->start;
+       start = max_t(u64, start, em->start);
+       end = next ? next->start : extent_map_end(em);
+       end = min_t(u64, end, extent_map_end(em));
+       start_diff = start - em->start;
+       em->start = start;
+       em->len = end - start;
         if (em->block_start < EXTENT_MAP_LAST_BYTE &&
             !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                 em->block_start += start_diff;
@@ -6333,7 +6447,7 @@ again:
                               struct btrfs_file_extent_item);
         /* are we inside the extent that was found? */
         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-       found_type = btrfs_key_type(&found_key);
+       found_type = found_key.type;
         if (found_key.objectid != objectid ||
             found_type != BTRFS_EXTENT_DATA_KEY) {
                 /*
@@ -6482,25 +6596,21 @@ insert:
  
                 ret = 0;
  
-               existing = lookup_extent_mapping(em_tree, start, len);
-               if (existing && (existing->start > start ||
-                   existing->start + existing->len <= start)) {
+               existing = search_extent_mapping(em_tree, start, len);
+               /*
+                * existing will always be non-NULL, since there must be
+                * extent causing the -EEXIST.
+                */
+               if (start >= extent_map_end(existing) ||
+                   start <= existing->start) {
+                       /*
+                        * The existing extent map is the one nearest to
+                        * the [start, start + len) range which overlaps
+                        */
+                       err = merge_extent_mapping(em_tree, existing,
+                                                  em, start);
                         free_extent_map(existing);
-                       existing = NULL;
-               }
-               if (!existing) {
-                       existing = lookup_extent_mapping(em_tree, em->start,
-                                                        em->len);
-                       if (existing) {
-                               err = merge_extent_mapping(em_tree, existing,
-                                                          em, start);
-                               free_extent_map(existing);
-                               if (err) {
-                                       free_extent_map(em);
-                                       em = NULL;
-                               }
-                       } else {
-                               err = -EIO;
+                       if (err) {
                                 free_extent_map(em);
                                 em = NULL;
                         }
@@ -6942,9 +7052,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                         btrfs_put_ordered_extent(ordered);
                 } else {
                         /* Screw you mmap */
-                       ret = filemap_write_and_wait_range(inode->i_mapping,
-                                                          lockstart,
-                                                          lockend);
+                       ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
+                       if (ret)
+                               break;
+                       ret = filemap_fdatawait_range(inode->i_mapping,
+                                                     lockstart,
+                                                     lockend);
                         if (ret)
                                 break;
  
@@ -7112,8 +7225,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                                        block_start, len,
                                                        orig_block_len,
                                                        ram_bytes, type);
-                               if (IS_ERR(em))
+                               if (IS_ERR(em)) {
+                                       ret = PTR_ERR(em);
                                         goto unlock_err;
+                               }
                         }
  
                         ret = btrfs_add_ordered_extent_dio(inode, start,
@@ -7188,45 +7303,277 @@ unlock_err:
         return ret;
  }
  
-static void btrfs_endio_direct_read(struct bio *bio, int err)
+/*
+ * Submit a direct I/O repair read @bio to mirror @mirror_num.
+ *
+ * Only reads are accepted: a @rw with REQ_WRITE set trips the BUG_ON below.
+ * An extra reference on @bio is held across the submission and dropped
+ * again before returning, on both the success and the error path.
+ *
+ * Returns 0 on success, or the error from btrfs_bio_wq_end_io() or
+ * btrfs_map_bio().
+ */
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+                                       int rw, int mirror_num)
  {
-       struct btrfs_dio_private *dip = bio->bi_private;
-       struct bio_vec *bvec;
-       struct inode *inode = dip->inode;
         struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct bio *dio_bio;
-       u32 *csums = (u32 *)dip->csum;
+       int ret;
+
+       BUG_ON(rw & REQ_WRITE);
+
+       bio_get(bio);
+
+       /* Register the bio's completion under the DIO repair end-io type. */
+       ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+                                 BTRFS_WQ_ENDIO_DIO_REPAIR);
+       if (ret)
+               goto err;
+
+       ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+       /* Reached on success too: drops the reference taken by bio_get(). */
+       bio_put(bio);
+       return ret;
+}
+
+/*
+ * Decide whether a failed direct I/O read can be retried on another mirror.
+ *
+ * Records @failed_mirror in @failrec and advances failrec->this_mirror to the
+ * next candidate, skipping @failed_mirror itself.  @failed_bio is not
+ * referenced in this function.
+ *
+ * Returns 1 when failrec->this_mirror names a mirror that can still be tried,
+ * 0 when there is only one copy or this_mirror has moved past num_copies.
+ */
+static int btrfs_check_dio_repairable(struct inode *inode,
+                                     struct bio *failed_bio,
+                                     struct io_failure_record *failrec,
+                                     int failed_mirror)
+{
+       int num_copies;
+
+       num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+                                     failrec->logical, failrec->len);
+       if (num_copies == 1) {
+               /*
+                * we only have a single copy of the data, so don't bother with
+                * all the retry and error correction code that follows. no
+                * matter what the error is, it is very likely to persist.
+                */
+               pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+                        num_copies, failrec->this_mirror, failed_mirror);
+               return 0;
+       }
+
+       /* Move on to the next mirror, stepping over the one that just failed. */
+       failrec->failed_mirror = failed_mirror;
+       failrec->this_mirror++;
+       if (failrec->this_mirror == failed_mirror)
+               failrec->this_mirror++;
+
+       /* Ran past the last copy: nothing is left to try. */
+       if (failrec->this_mirror > num_copies) {
+               pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+                        num_copies, failrec->this_mirror, failed_mirror);
+               return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * Build and submit a repair read after the direct I/O read @failed_bio
+ * failed on mirror @failed_mirror.
+ *
+ * The failure record is obtained for [@start, @end]; @page, @repair_endio
+ * and @repair_arg are handed to btrfs_create_repair_bio() for the new bio.
+ * Only reads may be repaired (BUG_ON on REQ_WRITE).
+ *
+ * Returns 0 when the repair bio was submitted, -EIO when
+ * btrfs_check_dio_repairable() reports nothing left to try or the repair bio
+ * cannot be created, or the error from btrfs_get_io_failure_record() or
+ * submit_dio_repair_bio().  The failure record is released with
+ * free_io_failure() on every error path taken after it was obtained.
+ */
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+                         struct page *page, u64 start, u64 end,
+                         int failed_mirror, bio_end_io_t *repair_endio,
+                         void *repair_arg)
+{
+       struct io_failure_record *failrec;
+       struct bio *bio;
+       int isector;
+       int read_mode;
+       int ret;
+
+       BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+       ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+       if (ret)
+               return ret;
+
+       ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
+                                        failed_mirror);
+       if (!ret) {
+               free_io_failure(inode, failrec);
+               return -EIO;
+       }
+
+       /* A failed bio with more than one vec also gets REQ_FAILFAST_DEV. */
+       if (failed_bio->bi_vcnt > 1)
+               read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+       else
+               read_mode = READ_SYNC;
+
+       /*
+        * Byte distance of @start from the failed bio's logical start,
+        * shifted down to a block index.
+        * NOTE(review): the u64 difference is narrowed to int here.
+        */
+       isector = start - btrfs_io_bio(failed_bio)->logical;
+       isector >>= inode->i_sb->s_blocksize_bits;
+       bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+                                     0, isector, repair_endio, repair_arg);
+       if (!bio) {
+               free_io_failure(inode, failrec);
+               return -EIO;
+       }
+
+       btrfs_debug(BTRFS_I(inode)->root->fs_info,
+                   "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+                   read_mode, failrec->this_mirror, failrec->in_validation);
+
+       ret = submit_dio_repair_bio(inode, bio, read_mode,
+                                   failrec->this_mirror);
+       if (ret) {
+               /* Submission failed: release the record and drop the bio. */
+               free_io_failure(inode, failrec);
+               bio_put(bio);
+       }
+
+       return ret;
+}
+
+/*
+ * State shared between a caller that submits a DIO repair read and waits
+ * for it, and the end_io handler (btrfs_retry_endio() or
+ * btrfs_retry_endio_nocsum()) that receives it through bio->bi_private.
+ */
+struct btrfs_retry_complete {
+       /* Signalled by the end_io handler; the submitter waits on it. */
+       struct completion done;
+       /* Inode the handler passes to clean_io_failure(). */
+       struct inode *inode;
+       /* Start offset of the segment currently being retried. */
         u64 start;
+       /* Cleared by the submitter, set to 1 by the handler on success. */
+       int uptodate;
+};
+
+/*
+ * end_io handler for a DIO repair read issued by
+ * __btrfs_correct_data_nocsum(); no checksum verification is done here.
+ *
+ * When @err is zero, flags the retry as up to date and calls
+ * clean_io_failure() for each segment of @bio.  When @err is set,
+ * done->uptodate is left as the submitter initialised it.  In both cases
+ * the waiter is woken and the bio reference is dropped.
+ */
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+       struct btrfs_retry_complete *done = bio->bi_private;
+       struct bio_vec *bvec;
+       int i;
+
+       if (err)
+               goto end;
+
+       done->uptodate = 1;
+       bio_for_each_segment_all(bvec, bio, i)
+               clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+end:
+       complete(&done->done);
+       bio_put(bio);
+}
+
+/*
+ * Re-read every segment of a failed DIO read bio when the inode carries no
+ * data checksums.
+ *
+ * Each segment is resubmitted through dio_read_error() and waited for
+ * synchronously.  A retry that does not come back up to date is submitted
+ * again; the loop for a segment only ends when the retry succeeds or
+ * dio_read_error() returns an error (presumably once no mirror is left --
+ * confirm against btrfs_check_dio_repairable()).
+ *
+ * Returns 0 when every segment was re-read successfully, otherwise the
+ * first error returned by dio_read_error().
+ */
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+                                      struct btrfs_io_bio *io_bio)
+{
+       struct bio_vec *bvec;
+       struct btrfs_retry_complete done;
+       u64 start;
+       int i;
+       int ret;
+
+       start = io_bio->logical;
+       done.inode = inode;
+
+       bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+try_again:
+               /* Fresh per-attempt state for the end_io handler. */
+               done.uptodate = 0;
+               done.start = start;
+               init_completion(&done.done);
+
+               ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+                                    start + bvec->bv_len - 1,
+                                    io_bio->mirror_num,
+                                    btrfs_retry_endio_nocsum, &done);
+               if (ret)
+                       return ret;
+
+               wait_for_completion(&done.done);
+
+               if (!done.uptodate) {
+                       /* We might have another mirror, so try again */
+                       goto try_again;
+               }
+
+               start += bvec->bv_len;
+       }
+
+       return 0;
+}
+
+/*
+ * end_io handler for a DIO repair read issued by __btrfs_subio_endio_read().
+ *
+ * When @err is zero, every segment of @bio is verified with
+ * __readpage_endio_check(); segments that pass get clean_io_failure(), and
+ * done->uptodate is set to 1 only if all segments passed.  When @err is
+ * set, done->uptodate is left as the submitter initialised it.  In both
+ * cases the waiter is woken and the bio reference is dropped.
+ */
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+       struct btrfs_retry_complete *done = bio->bi_private;
+       struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+       struct bio_vec *bvec;
+       int uptodate;
+       int ret;
         int i;
  
-       start = dip->logical_offset;
+       if (err)
+               goto end;
+
+       uptodate = 1;
         bio_for_each_segment_all(bvec, bio, i) {
-               if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-                       struct page *page = bvec->bv_page;
-                       char *kaddr;
-                       u32 csum = ~(u32)0;
-                       unsigned long flags;
-
-                       local_irq_save(flags);
-                       kaddr = kmap_atomic(page);
-                       csum = btrfs_csum_data(kaddr + bvec->bv_offset,
-                                              csum, bvec->bv_len);
-                       btrfs_csum_final(csum, (char *)&csum);
-                       kunmap_atomic(kaddr);
-                       local_irq_restore(flags);
-
-                       flush_dcache_page(bvec->bv_page);
-                       if (csum != csums[i]) {
-                               btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
-                                         btrfs_ino(inode), start, csum,
-                                         csums[i]);
-                               err = -EIO;
-                       }
+               ret = __readpage_endio_check(done->inode, io_bio, i,
+                                            bvec->bv_page, 0,
+                                            done->start, bvec->bv_len);
+               if (!ret)
+                       clean_io_failure(done->inode, done->start,
+                                        bvec->bv_page, 0);
+               else
+                       uptodate = 0;
+       }
+
+       done->uptodate = uptodate;
+end:
+       complete(&done->done);
+       bio_put(bio);
+}
+
+/*
+ * Verify the checksum of every segment in @io_bio and, for each segment that
+ * fails __readpage_endio_check(), synchronously retry it through
+ * dio_read_error() with btrfs_retry_endio() as the completion handler.
+ *
+ * The incoming @err is discarded (it is reset to 0 below).  The return value
+ * is 0, or the error from the most recent dio_read_error() call that
+ * failed.  A segment whose repair cannot be submitted is skipped and the
+ * loop carries on with the next segment.
+ *
+ * NOTE(review): 'offset' is advanced per segment but never read in this
+ * function.
+ */
+static int __btrfs_subio_endio_read(struct inode *inode,
+                                   struct btrfs_io_bio *io_bio, int err)
+{
+       struct bio_vec *bvec;
+       struct btrfs_retry_complete done;
+       u64 start;
+       u64 offset = 0;
+       int i;
+       int ret;
+
+       err = 0;
+       start = io_bio->logical;
+       done.inode = inode;
+
+       bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+               ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
+                                            0, start, bvec->bv_len);
+               if (likely(!ret))
+                       goto next;
+try_again:
+               /* Fresh per-attempt state for the end_io handler. */
+               done.uptodate = 0;
+               done.start = start;
+               init_completion(&done.done);
+
+               ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+                                    start + bvec->bv_len - 1,
+                                    io_bio->mirror_num,
+                                    btrfs_retry_endio, &done);
+               if (ret) {
+                       err = ret;
+                       goto next;
                 }
  
+               wait_for_completion(&done.done);
+
+               if (!done.uptodate) {
+                       /* We might have another mirror, so try again */
+                       goto try_again;
+               }
+next:
+               offset += bvec->bv_len;
                 start += bvec->bv_len;
         }
  
+       return err;
+}
+
+/*
+ * Per-bio completion step for direct I/O reads; installed as
+ * dip->subio_endio by btrfs_submit_direct().
+ *
+ * For an inode flagged BTRFS_INODE_NODATASUM there is nothing to verify, so
+ * the bio is only re-read (via __btrfs_correct_data_nocsum()) when @err is
+ * set, and 0 is returned otherwise.  For all other inodes the work is
+ * delegated to __btrfs_subio_endio_read().
+ *
+ * Returns 0 on success or the error from the helper that was called.
+ */
+static int btrfs_subio_endio_read(struct inode *inode,
+                                 struct btrfs_io_bio *io_bio, int err)
+{
+       bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+       if (skip_csum) {
+               if (unlikely(err))
+                       return __btrfs_correct_data_nocsum(inode, io_bio);
+               else
+                       return 0;
+       } else {
+               return __btrfs_subio_endio_read(inode, io_bio, err);
+       }
+}
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+       struct btrfs_dio_private *dip = bio->bi_private;
+       struct inode *inode = dip->inode;
+       struct bio *dio_bio;
+       struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+
+       if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+               err = btrfs_subio_endio_read(inode, io_bio, err);
+
         unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
                       dip->logical_offset + dip->bytes - 1);
         dio_bio = dip->dio_bio;
@@ -7237,6 +7584,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
         if (err)
                 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
         dio_end_io(dio_bio, err);
+
+       if (io_bio->end_io)
+               io_bio->end_io(io_bio, err);
         bio_put(bio);
  }
  
@@ -7302,12 +7652,17 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
  {
         struct btrfs_dio_private *dip = bio->bi_private;
  
+       if (err)
+               btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+                          "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
+                          btrfs_ino(dip->inode), bio->bi_rw,
+                          (unsigned long long)bio->bi_iter.bi_sector,
+                          bio->bi_iter.bi_size, err);
+
+       if (dip->subio_endio)
+               err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
+
         if (err) {
-               btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
-                         "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
-                     btrfs_ino(dip->inode), bio->bi_rw,
-                     (unsigned long long)bio->bi_iter.bi_sector,
-                     bio->bi_iter.bi_size, err);
                 dip->errors = 1;
  
                 /*
@@ -7338,6 +7693,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
         return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
  }
  
+/*
+ * Attach checksum data to a direct I/O read @bio that starts at @file_offset.
+ *
+ * The checksums are looked up once, for dip->orig_bio, when the bio starting
+ * at dip->logical_offset is submitted.  A bio other than dip->orig_bio then
+ * has its csum pointer aimed into the original bio's csum buffer, at the
+ * block index of @file_offset relative to dip->logical_offset.
+ *
+ * Returns 0 on success or the error from btrfs_lookup_bio_sums_dio().
+ */
+static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
+                                                struct inode *inode,
+                                                struct btrfs_dio_private *dip,
+                                                struct bio *bio,
+                                                u64 file_offset)
+{
+       struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+       struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
+       int ret;
+
+       /*
+        * We load all the csum data we need when we submit
+        * the first bio to reduce the csum tree search and
+        * contention.
+        */
+       if (dip->logical_offset == file_offset) {
+               ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
+                                               file_offset);
+               if (ret)
+                       return ret;
+       }
+
+       /* The original bio already owns the csum buffer; nothing to bind. */
+       if (bio == dip->orig_bio)
+               return 0;
+
+       /*
+        * Turn the byte offset into the DIO range into a block index; the
+        * csum buffer is then stepped in u32 units.
+        */
+       file_offset -= dip->logical_offset;
+       file_offset >>= inode->i_sb->s_blocksize_bits;
+       io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
+
+       return 0;
+}
+
  static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                                          int rw, u64 file_offset, int skip_sum,
                                          int async_submit)
@@ -7353,7 +7740,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
         bio_get(bio);
  
         if (!write) {
-               ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+               ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+                               BTRFS_WQ_ENDIO_DATA);
                 if (ret)
                         goto err;
         }
@@ -7376,13 +7764,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
                 if (ret)
                         goto err;
-       } else if (!skip_sum) {
-               ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
-                                               file_offset);
+       } else {
+               ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
+                                                    file_offset);
                 if (ret)
                         goto err;
         }
-
  map:
         ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
  err:
@@ -7403,7 +7790,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
         u64 submit_len = 0;
         u64 map_length;
         int nr_pages = 0;
-       int ret = 0;
+       int ret;
         int async_submit = 0;
  
         map_length = orig_bio->bi_iter.bi_size;
@@ -7414,6 +7801,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
  
         if (map_length >= orig_bio->bi_iter.bi_size) {
                 bio = orig_bio;
+               dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
                 goto submit;
         }
  
@@ -7430,12 +7818,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
  
         bio->bi_private = dip;
         bio->bi_end_io = btrfs_end_dio_bio;
+       btrfs_io_bio(bio)->logical = file_offset;
         atomic_inc(&dip->pending_bios);
  
         while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
-               if (unlikely(map_length < submit_len + bvec->bv_len ||
+               if (map_length < submit_len + bvec->bv_len ||
                     bio_add_page(bio, bvec->bv_page, bvec->bv_len,
-                                bvec->bv_offset) < bvec->bv_len)) {
+                                bvec->bv_offset) < bvec->bv_len) {
                         /*
                          * inc the count before we submit the bio so
                          * we know the end IO handler won't happen before
@@ -7464,6 +7853,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
                                 goto out_err;
                         bio->bi_private = dip;
                         bio->bi_end_io = btrfs_end_dio_bio;
+                       btrfs_io_bio(bio)->logical = file_offset;
  
                         map_length = orig_bio->bi_iter.bi_size;
                         ret = btrfs_map_block(root->fs_info, rw,
@@ -7507,11 +7897,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_dio_private *dip;
         struct bio *io_bio;
+       struct btrfs_io_bio *btrfs_bio;
         int skip_sum;
-       int sum_len;
         int write = rw & REQ_WRITE;
         int ret = 0;
-       u16 csum_size;
  
         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
  
@@ -7521,16 +7910,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
                 goto free_ordered;
         }
  
-       if (!skip_sum && !write) {
-               csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
-               sum_len = dio_bio->bi_iter.bi_size >>
-                       inode->i_sb->s_blocksize_bits;
-               sum_len *= csum_size;
-       } else {
-               sum_len = 0;
-       }
-
-       dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
+       dip = kzalloc(sizeof(*dip), GFP_NOFS);
         if (!dip) {
                 ret = -ENOMEM;
                 goto free_io_bio;
@@ -7542,20 +7922,25 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
         dip->bytes = dio_bio->bi_iter.bi_size;
         dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
         io_bio->bi_private = dip;
-       dip->errors = 0;
         dip->orig_bio = io_bio;
         dip->dio_bio = dio_bio;
         atomic_set(&dip->pending_bios, 0);
+       btrfs_bio = btrfs_io_bio(io_bio);
+       btrfs_bio->logical = file_offset;
  
-       if (write)
+       if (write) {
                 io_bio->bi_end_io = btrfs_endio_direct_write;
-       else
+       } else {
                 io_bio->bi_end_io = btrfs_endio_direct_read;
+               dip->subio_endio = btrfs_subio_endio_read;
+       }
  
         ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
         if (!ret)
                 return;
  
+       if (btrfs_bio->end_io)
+               btrfs_bio->end_io(btrfs_bio, ret);
  free_io_bio:
         bio_put(io_bio);
  
@@ -7652,8 +8037,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                 ret = btrfs_delalloc_reserve_space(inode, count);
                 if (ret)
                         goto out;
-       } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
-                                    &BTRFS_I(inode)->runtime_flags))) {
+       } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+                                    &BTRFS_I(inode)->runtime_flags)) {
                 inode_dio_done(inode);
                 flags = DIO_LOCKING | DIO_SKIP_HOLES;
                 wakeup = false;
@@ -8173,6 +8558,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
         ei->last_sub_trans = 0;
         ei->logged_trans = 0;
         ei->delalloc_bytes = 0;
+       ei->defrag_bytes = 0;
         ei->disk_i_size = 0;
         ei->flags = 0;
         ei->csum_bytes = 0;
@@ -8231,6 +8617,7 @@ void btrfs_destroy_inode(struct inode *inode)
         WARN_ON(BTRFS_I(inode)->reserved_extents);
         WARN_ON(BTRFS_I(inode)->delalloc_bytes);
         WARN_ON(BTRFS_I(inode)->csum_bytes);
+       WARN_ON(BTRFS_I(inode)->defrag_bytes);
  
         /*
          * This can happen where we create an inode, but somebody else also
@@ -8646,7 +9033,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
                 spin_unlock(&root->delalloc_lock);
  
                 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
-               if (unlikely(!work)) {
+               if (!work) {
                         if (delay_iput)
                                 btrfs_add_delayed_iput(inode);
                         else
@@ -8832,7 +9219,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
         }
         key.objectid = btrfs_ino(inode);
         key.offset = 0;
-       btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+       key.type = BTRFS_EXTENT_DATA_KEY;
         datasize = btrfs_file_extent_calc_inline_size(name_len);
         err = btrfs_insert_empty_item(trans, root, path, &key,
                                       datasize);
@@ -9110,6 +9497,21 @@ out_inode:
  
  }
  
+/*
+ * Inspired by filemap_check_errors().
+ *
+ * Report and clear the AS_ENOSPC and AS_EIO bits in the flags of the inode's
+ * mapping.  Returns -ENOSPC or -EIO if the corresponding bit was set and 0
+ * otherwise; when both were set the result is -EIO because it is assigned
+ * last.
+ */
+int btrfs_inode_check_errors(struct inode *inode)
+{
+       int ret = 0;
+
+       /* test_bit() first, so test_and_clear_bit() only runs on a set bit. */
+       if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
+           test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
+               ret = -ENOSPC;
+       if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
+           test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
+               ret = -EIO;
+
+       return ret;
+}
+
  static const struct inode_operations btrfs_dir_inode_operations = {
         .getattr        = btrfs_getattr,
         .lookup         = btrfs_lookup,