Merge tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 27 Jul 2016 16:53:35 +0000 (09:53 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 27 Jul 2016 16:53:35 +0000 (09:53 -0700)
Pull xfs updates from Dave Chinner:
 "The major addition is the new iomap based block mapping
  infrastructure.  We've been kicking this about locally for years, but
   there are other filesystems that want to use it too (e.g. gfs2).  Now
   it is fully working, reviewed and ready to be merged and used by
   other filesystems.

  There are a lot of other fixes and cleanups in the tree, but those are
  XFS internal things and none are of the scale or visibility of the
  iomap changes.  See below for details.

   I am likely to send another pull request next week - we're just about
   ready to merge some new functionality (on-disk block->owner reverse
   mapping infrastructure), but that's a huge chunk of code (74 files
   changed, 7283 insertions(+), 1114 deletions(-)) so I'm keeping that
   separate from all the "normal" pull request changes so they don't get
   lost in the noise.

  Summary of changes in this update:
   - generic iomap based IO path infrastructure
   - generic iomap based fiemap implementation
   - xfs iomap based IO path implementation
   - buffer error handling fixes
   - tracking of in-flight buffer IO for unmount serialisation
   - direct IO and DAX IO path separation and simplification
   - shortform directory format definition changes for wider platform
     compatibility
   - various buffer cache fixes
   - cleanups in preparation for rmap merge
   - error injection cleanups and fixes
   - log item format buffer memory allocation restructuring to prevent
     rare OOM reclaim deadlocks
   - sparse inode chunks are now fully supported"

* tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (53 commits)
  xfs: remove EXPERIMENTAL tag from sparse inode feature
  xfs: bufferhead chains are invalid after end_page_writeback
  xfs: allocate log vector buffers outside CIL context lock
  libxfs: directory node splitting does not have an extra block
  xfs: remove dax code from object file when disabled
  xfs: skip dirty pages in ->releasepage()
  xfs: remove __arch_pack
  xfs: kill xfs_dir2_inou_t
  xfs: kill xfs_dir2_sf_off_t
  xfs: split direct I/O and DAX path
  xfs: direct calls in the direct I/O path
  xfs: stop using generic_file_read_iter for direct I/O
  xfs: split xfs_file_read_iter into buffered and direct I/O helpers
  xfs: remove s_maxbytes enforcement in xfs_file_read_iter
  xfs: kill ioflags
  xfs: don't pass ioflags around in the ioctl path
  xfs: track and serialize in-flight async buffers against unmount
  xfs: exclude never-released buffers from buftarg I/O accounting
  xfs: don't reset b_retries to 0 on every failure
  xfs: remove extraneous buffer flag changes
  ...

fs/buffer.c
fs/internal.h
fs/nfsd/blocklayout.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_file.c

diff --combined fs/buffer.c
@@@ -21,6 -21,7 +21,7 @@@
  #include <linux/kernel.h>
  #include <linux/syscalls.h>
  #include <linux/fs.h>
+ #include <linux/iomap.h>
  #include <linux/mm.h>
  #include <linux/percpu.h>
  #include <linux/slab.h>
@@@ -45,7 -46,7 +46,7 @@@
  #include <trace/events/block.h>
  
  static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 -static int submit_bh_wbc(int rw, struct buffer_head *bh,
 +static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
                         unsigned long bio_flags,
                         struct writeback_control *wbc);
  
@@@ -153,7 -154,7 +154,7 @@@ static void __end_buffer_read_notouch(s
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
 -              /* This happens, due to failed READA attempts. */
 +              /* This happens, due to failed read-ahead attempts. */
                clear_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
@@@ -588,7 -589,7 +589,7 @@@ void write_boundary_block(struct block_
        struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
        if (bh) {
                if (buffer_dirty(bh))
 -                      ll_rw_block(WRITE, 1, &bh);
 +                      ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
                put_bh(bh);
        }
  }
@@@ -1225,7 -1226,7 +1226,7 @@@ static struct buffer_head *__bread_slow
        } else {
                get_bh(bh);
                bh->b_end_io = end_buffer_read_sync;
 -              submit_bh(READ, bh);
 +              submit_bh(REQ_OP_READ, 0, bh);
                wait_on_buffer(bh);
                if (buffer_uptodate(bh))
                        return bh;
@@@ -1395,7 -1396,7 +1396,7 @@@ void __breadahead(struct block_device *
  {
        struct buffer_head *bh = __getblk(bdev, block, size);
        if (likely(bh)) {
 -              ll_rw_block(READA, 1, &bh);
 +              ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
                brelse(bh);
        }
  }
@@@ -1687,7 -1688,7 +1688,7 @@@ static struct buffer_head *create_page_
   * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
   * causes the writes to be flagged as synchronous writes.
   */
 -static int __block_write_full_page(struct inode *inode, struct page *page,
 +int __block_write_full_page(struct inode *inode, struct page *page,
                        get_block_t *get_block, struct writeback_control *wbc,
                        bh_end_io_t *handler)
  {
        struct buffer_head *bh, *head;
        unsigned int blocksize, bbits;
        int nr_underway = 0;
 -      int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
 +      int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
  
        head = create_page_buffers(page, inode,
                                        (1 << BH_Dirty)|(1 << BH_Uptodate));
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
 -                      submit_bh_wbc(write_op, bh, 0, wbc);
 +                      submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
                        nr_underway++;
                }
                bh = next;
@@@ -1840,7 -1841,7 +1841,7 @@@ recover
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        clear_buffer_dirty(bh);
 -                      submit_bh_wbc(write_op, bh, 0, wbc);
 +                      submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
                        nr_underway++;
                }
                bh = next;
        unlock_page(page);
        goto done;
  }
 +EXPORT_SYMBOL(__block_write_full_page);
  
  /*
   * If a page has any new buffers, zero them out here, and mark them uptodate
@@@ -1892,8 -1892,62 +1893,62 @@@ void page_zero_new_buffers(struct page 
  }
  EXPORT_SYMBOL(page_zero_new_buffers);
  
- int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-               get_block_t *get_block)
+ static void
+ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
+               struct iomap *iomap)
+ {
+       loff_t offset = block << inode->i_blkbits;
+       bh->b_bdev = iomap->bdev;
+       /*
+        * Block points to offset in file we need to map, iomap contains
+        * the offset at which the map starts. If the map ends before the
+        * current block, then do not map the buffer and let the caller
+        * handle it.
+        */
+       BUG_ON(offset >= iomap->offset + iomap->length);
+       switch (iomap->type) {
+       case IOMAP_HOLE:
+               /*
+                * If the buffer is not up to date or beyond the current EOF,
+                * we need to mark it as new to ensure sub-block zeroing is
+                * executed if necessary.
+                */
+               if (!buffer_uptodate(bh) ||
+                   (offset >= i_size_read(inode)))
+                       set_buffer_new(bh);
+               break;
+       case IOMAP_DELALLOC:
+               if (!buffer_uptodate(bh) ||
+                   (offset >= i_size_read(inode)))
+                       set_buffer_new(bh);
+               set_buffer_uptodate(bh);
+               set_buffer_mapped(bh);
+               set_buffer_delay(bh);
+               break;
+       case IOMAP_UNWRITTEN:
+               /*
+                * For unwritten regions, we always need to ensure that
+                * sub-block writes zero the parts of the block we are not
+                * writing to. Set the buffer as new to ensure this.
+                */
+               set_buffer_new(bh);
+               set_buffer_unwritten(bh);
+               /* FALLTHRU */
+       case IOMAP_MAPPED:
+               if (offset >= i_size_read(inode))
+                       set_buffer_new(bh);
+               bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
+                               ((offset - iomap->offset) >> inode->i_blkbits);
+               set_buffer_mapped(bh);
+               break;
+       }
+ }
+ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+               get_block_t *get_block, struct iomap *iomap)
  {
        unsigned from = pos & (PAGE_SIZE - 1);
        unsigned to = from + len;
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
-                       err = get_block(inode, block, bh, 1);
-                       if (err)
-                               break;
+                       if (get_block) {
+                               err = get_block(inode, block, bh, 1);
+                               if (err)
+                                       break;
+                       } else {
+                               iomap_to_bh(inode, block, bh, iomap);
+                       }
                        if (buffer_new(bh)) {
                                unmap_underlying_metadata(bh->b_bdev,
                                                        bh->b_blocknr);
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                     (block_start < from || block_end > to)) {
 -                      ll_rw_block(READ, 1, &bh);
 +                      ll_rw_block(REQ_OP_READ, 0, 1, &bh);
                        *wait_bh++=bh;
                }
        }
                page_zero_new_buffers(page, from, to);
        return err;
  }
+ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+               get_block_t *get_block)
+ {
+       return __block_write_begin_int(page, pos, len, get_block, NULL);
+ }
  EXPORT_SYMBOL(__block_write_begin);
  
  static int __block_commit_write(struct inode *inode, struct page *page,
@@@ -2249,7 -2314,7 +2315,7 @@@ int block_read_full_page(struct page *p
                if (buffer_uptodate(bh))
                        end_buffer_async_read(bh, 1);
                else
 -                      submit_bh(READ, bh);
 +                      submit_bh(REQ_OP_READ, 0, bh);
        }
        return 0;
  }
@@@ -2583,7 -2648,7 +2649,7 @@@ int nobh_write_begin(struct address_spa
                if (block_start < from || block_end > to) {
                        lock_buffer(bh);
                        bh->b_end_io = end_buffer_read_nobh;
 -                      submit_bh(READ, bh);
 +                      submit_bh(REQ_OP_READ, 0, bh);
                        nr_reads++;
                }
        }
@@@ -2853,7 -2918,7 +2919,7 @@@ int block_truncate_page(struct address_
  
        if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
                err = -EIO;
 -              ll_rw_block(READ, 1, &bh);
 +              ll_rw_block(REQ_OP_READ, 0, 1, &bh);
                wait_on_buffer(bh);
                /* Uhhuh. Read error. Complain and punt. */
                if (!buffer_uptodate(bh))
@@@ -2950,7 -3015,7 +3016,7 @@@ static void end_bio_bh_io_sync(struct b
   * errors, this only handles the "we need to be able to
   * do IO at the final sector" case.
   */
 -void guard_bio_eod(int rw, struct bio *bio)
 +void guard_bio_eod(int op, struct bio *bio)
  {
        sector_t maxsector;
        struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
        bvec->bv_len -= truncated_bytes;
  
        /* ..and clear the end of the buffer for reads */
 -      if ((rw & RW_MASK) == READ) {
 +      if (op == REQ_OP_READ) {
                zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
                                truncated_bytes);
        }
  }
  
 -static int submit_bh_wbc(int rw, struct buffer_head *bh,
 +static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
                         unsigned long bio_flags, struct writeback_control *wbc)
  {
        struct bio *bio;
        /*
         * Only clear out a write error when rewriting
         */
 -      if (test_set_buffer_req(bh) && (rw & WRITE))
 +      if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
                clear_buffer_write_io_error(bh);
  
        /*
        bio->bi_flags |= bio_flags;
  
        /* Take care of bh's that straddle the end of the device */
 -      guard_bio_eod(rw, bio);
 +      guard_bio_eod(op, bio);
  
        if (buffer_meta(bh))
 -              rw |= REQ_META;
 +              op_flags |= REQ_META;
        if (buffer_prio(bh))
 -              rw |= REQ_PRIO;
 +              op_flags |= REQ_PRIO;
 +      bio_set_op_attrs(bio, op, op_flags);
  
 -      submit_bio(rw, bio);
 +      submit_bio(bio);
        return 0;
  }
  
 -int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 +int _submit_bh(int op, int op_flags, struct buffer_head *bh,
 +             unsigned long bio_flags)
  {
 -      return submit_bh_wbc(rw, bh, bio_flags, NULL);
 +      return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL);
  }
  EXPORT_SYMBOL_GPL(_submit_bh);
  
 -int submit_bh(int rw, struct buffer_head *bh)
 +int submit_bh(int op, int op_flags,  struct buffer_head *bh)
  {
 -      return submit_bh_wbc(rw, bh, 0, NULL);
 +      return submit_bh_wbc(op, op_flags, bh, 0, NULL);
  }
  EXPORT_SYMBOL(submit_bh);
  
  /**
   * ll_rw_block: low-level access to block devices (DEPRECATED)
 - * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
 + * @op: whether to %READ or %WRITE
 + * @op_flags: rq_flag_bits
   * @nr: number of &struct buffer_heads in the array
   * @bhs: array of pointers to &struct buffer_head
   *
   * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
 - * requests an I/O operation on them, either a %READ or a %WRITE.  The third
 - * %READA option is described in the documentation for generic_make_request()
 - * which ll_rw_block() calls.
 + * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
 + * @op_flags contains flags modifying the detailed I/O behavior, most notably
 + * %REQ_RAHEAD.
   *
   * This function drops any buffer that it cannot get a lock on (with the
   * BH_Lock state bit), any buffer that appears to be clean when doing a write
   * All of the buffers must be for the same device, and must also be a
   * multiple of the current approved size for the device.
   */
 -void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 +void ll_rw_block(int op, int op_flags,  int nr, struct buffer_head *bhs[])
  {
        int i;
  
  
                if (!trylock_buffer(bh))
                        continue;
 -              if (rw == WRITE) {
 +              if (op == WRITE) {
                        if (test_clear_buffer_dirty(bh)) {
                                bh->b_end_io = end_buffer_write_sync;
                                get_bh(bh);
 -                              submit_bh(WRITE, bh);
 +                              submit_bh(op, op_flags, bh);
                                continue;
                        }
                } else {
                        if (!buffer_uptodate(bh)) {
                                bh->b_end_io = end_buffer_read_sync;
                                get_bh(bh);
 -                              submit_bh(rw, bh);
 +                              submit_bh(op, op_flags, bh);
                                continue;
                        }
                }
  }
  EXPORT_SYMBOL(ll_rw_block);
  
 -void write_dirty_buffer(struct buffer_head *bh, int rw)
 +void write_dirty_buffer(struct buffer_head *bh, int op_flags)
  {
        lock_buffer(bh);
        if (!test_clear_buffer_dirty(bh)) {
        }
        bh->b_end_io = end_buffer_write_sync;
        get_bh(bh);
 -      submit_bh(rw, bh);
 +      submit_bh(REQ_OP_WRITE, op_flags, bh);
  }
  EXPORT_SYMBOL(write_dirty_buffer);
  
   * and then start new I/O and then wait upon it.  The caller must have a ref on
   * the buffer_head.
   */
 -int __sync_dirty_buffer(struct buffer_head *bh, int rw)
 +int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
  {
        int ret = 0;
  
        if (test_clear_buffer_dirty(bh)) {
                get_bh(bh);
                bh->b_end_io = end_buffer_write_sync;
 -              ret = submit_bh(rw, bh);
 +              ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
                wait_on_buffer(bh);
                if (!ret && !buffer_uptodate(bh))
                        ret = -EIO;
@@@ -3395,7 -3457,7 +3461,7 @@@ int bh_submit_read(struct buffer_head *
  
        get_bh(bh);
        bh->b_end_io = end_buffer_read_sync;
 -      submit_bh(READ, bh);
 +      submit_bh(REQ_OP_READ, 0, bh);
        wait_on_buffer(bh);
        if (buffer_uptodate(bh))
                return 0;
diff --combined fs/internal.h
@@@ -11,6 -11,7 +11,7 @@@
  
  struct super_block;
  struct file_system_type;
+ struct iomap;
  struct linux_binprm;
  struct path;
  struct mount;
@@@ -39,6 -40,8 +40,8 @@@ static inline int __sync_blockdev(struc
   * buffer.c
   */
  extern void guard_bio_eod(int rw, struct bio *bio);
+ extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+               get_block_t *get_block, struct iomap *iomap);
  
  /*
   * char_dev.c
@@@ -130,7 -133,6 +133,7 @@@ extern int invalidate_inodes(struct sup
  extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
  extern int d_set_mounted(struct dentry *dentry);
  extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
 +extern struct dentry *d_alloc_cursor(struct dentry *);
  
  /*
   * read_write.c
diff --combined fs/nfsd/blocklayout.c
@@@ -2,6 -2,7 +2,7 @@@
   * Copyright (c) 2014-2016 Christoph Hellwig.
   */
  #include <linux/exportfs.h>
+ #include <linux/iomap.h>
  #include <linux/genhd.h>
  #include <linux/slab.h>
  #include <linux/pr.h>
@@@ -290,7 -291,7 +291,7 @@@ out_free_buf
        return error;
  }
  
 -#define NFSD_MDS_PR_KEY               0x0100000000000000
 +#define NFSD_MDS_PR_KEY               0x0100000000000000ULL
  
  /*
   * We use the client ID as a unique key for the reservations.
diff --combined fs/xfs/xfs_aops.c
@@@ -87,6 -87,12 +87,12 @@@ xfs_find_bdev_for_inode
   * We're now finished for good with this page.  Update the page state via the
   * associated buffer_heads, paying attention to the start and end offsets that
   * we need to process on the page.
+  *
+  * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+  * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+  * the page at all, as we may be racing with memory reclaim and it can free both
+  * the bufferhead chain and the page as it will see the page as clean and
+  * unused.
   */
  static void
  xfs_finish_page_writeback(
        int                     error)
  {
        unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
-       struct buffer_head      *head, *bh;
+       struct buffer_head      *head, *bh, *next;
        unsigned int            off = 0;
+       unsigned int            bsize;
  
        ASSERT(bvec->bv_offset < PAGE_SIZE);
        ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
  
        bh = head = page_buffers(bvec->bv_page);
  
+       bsize = bh->b_size;
        do {
+               next = bh->b_this_page;
                if (off < bvec->bv_offset)
                        goto next_bh;
                if (off > end)
                        break;
                bh->b_end_io(bh, !error);
  next_bh:
-               off += bh->b_size;
-       } while ((bh = bh->b_this_page) != head);
+               off += bsize;
+       } while ((bh = next) != head);
  }
  
  /*
@@@ -438,8 -447,7 +447,8 @@@ xfs_submit_ioend
  
        ioend->io_bio->bi_private = ioend;
        ioend->io_bio->bi_end_io = xfs_end_bio;
 -
 +      bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
 +                       (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
        /*
         * If we are failing the IO now, just mark the ioend with an
         * error and finish it. This will run IO completion immediately
                return status;
        }
  
 -      submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
 -                 ioend->io_bio);
 +      submit_bio(ioend->io_bio);
        return 0;
  }
  
@@@ -510,9 -519,8 +519,9 @@@ xfs_chain_bio
  
        bio_chain(ioend->io_bio, new);
        bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
 -      submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
 -                 ioend->io_bio);
 +      bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
 +                        (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
 +      submit_bio(ioend->io_bio);
        ioend->io_bio = new;
  }
  
@@@ -1041,6 -1049,20 +1050,20 @@@ xfs_vm_releasepage
  
        trace_xfs_releasepage(page->mapping->host, page, 0, 0);
  
+       /*
+        * mm accommodates an old ext3 case where clean pages might not have had
+        * the dirty bit cleared. Thus, it can send actual dirty pages to
+        * ->releasepage() via shrink_active_list(). Conversely,
+        * block_invalidatepage() can send pages that are still marked dirty
+        * but otherwise have invalidated buffers.
+        *
+        * We've historically freed buffers on the latter. Instead, quietly
+        * filter out all dirty pages to avoid spurious buffer state warnings.
+        * This can likely be removed once shrink_active_list() is fixed.
+        */
+       if (PageDirty(page))
+               return 0;
        xfs_count_page_state(page, &delalloc, &unwritten);
  
        if (WARN_ON_ONCE(delalloc))
@@@ -1144,6 -1166,8 +1167,8 @@@ __xfs_get_blocks
        ssize_t                 size;
        int                     new = 0;
  
+       BUG_ON(create && !direct);
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
  
        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
        size = bh_result->b_size;
  
-       if (!create && direct && offset >= i_size_read(inode))
+       if (!create && offset >= i_size_read(inode))
                return 0;
  
        /*
         * Direct I/O is usually done on preallocated files, so try getting
-        * a block mapping without an exclusive lock first.  For buffered
-        * writes we already have the exclusive iolock anyway, so avoiding
-        * a lock roundtrip here by taking the ilock exclusive from the
-        * beginning is a useful micro optimization.
+        * a block mapping without an exclusive lock first.
         */
-       if (create && !direct) {
-               lockmode = XFS_ILOCK_EXCL;
-               xfs_ilock(ip, lockmode);
-       } else {
-               lockmode = xfs_ilock_data_map_shared(ip);
-       }
+       lockmode = xfs_ilock_data_map_shared(ip);
  
        ASSERT(offset <= mp->m_super->s_maxbytes);
        if (offset + size > mp->m_super->s_maxbytes)
             (imap.br_startblock == HOLESTARTBLOCK ||
              imap.br_startblock == DELAYSTARTBLOCK) ||
             (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
-               if (direct || xfs_get_extsz_hint(ip)) {
-                       /*
-                        * xfs_iomap_write_direct() expects the shared lock. It
-                        * is unlocked on return.
-                        */
-                       if (lockmode == XFS_ILOCK_EXCL)
-                               xfs_ilock_demote(ip, lockmode);
-                       error = xfs_iomap_write_direct(ip, offset, size,
-                                                      &imap, nimaps);
-                       if (error)
-                               return error;
-                       new = 1;
+               /*
+                * xfs_iomap_write_direct() expects the shared lock. It
+                * is unlocked on return.
+                */
+               if (lockmode == XFS_ILOCK_EXCL)
+                       xfs_ilock_demote(ip, lockmode);
  
-               } else {
-                       /*
-                        * Delalloc reservations do not require a transaction,
-                        * we can go on without dropping the lock here. If we
-                        * are allocating a new delalloc block, make sure that
-                        * we set the new flag so that we mark the buffer new so
-                        * that we know that it is newly allocated if the write
-                        * fails.
-                        */
-                       if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
-                               new = 1;
-                       error = xfs_iomap_write_delay(ip, offset, size, &imap);
-                       if (error)
-                               goto out_unlock;
+               error = xfs_iomap_write_direct(ip, offset, size,
+                                              &imap, nimaps);
+               if (error)
+                       return error;
+               new = 1;
  
-                       xfs_iunlock(ip, lockmode);
-               }
                trace_xfs_get_blocks_alloc(ip, offset, size,
                                ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
                                                   : XFS_IO_DELALLOC, &imap);
        }
  
        /* trim mapping down to size requested */
-       if (direct || size > (1 << inode->i_blkbits))
-               xfs_map_trim_size(inode, iblock, bh_result,
-                                 &imap, offset, size);
+       xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
  
        /*
         * For unwritten extents do not report a disk address in the buffered
                if (ISUNWRITTEN(&imap))
                        set_buffer_unwritten(bh_result);
                /* direct IO needs special help */
-               if (create && direct) {
+               if (create) {
                        if (dax_fault)
                                ASSERT(!ISUNWRITTEN(&imap));
                        else
             (new || ISUNWRITTEN(&imap))))
                set_buffer_new(bh_result);
  
-       if (imap.br_startblock == DELAYSTARTBLOCK) {
-               BUG_ON(direct);
-               if (create) {
-                       set_buffer_uptodate(bh_result);
-                       set_buffer_mapped(bh_result);
-                       set_buffer_delay(bh_result);
-               }
-       }
+       BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
  
        return 0;
  
@@@ -1337,7 -1326,7 +1327,7 @@@ xfs_get_blocks_dax_fault
   * whereas if we have flags set we will always be called in task context
   * (i.e. from a workqueue).
   */
- STATIC int
+ int
  xfs_end_io_direct_write(
        struct kiocb            *iocb,
        loff_t                  offset,
@@@ -1408,234 -1397,10 +1398,10 @@@ xfs_vm_direct_IO
        struct kiocb            *iocb,
        struct iov_iter         *iter)
  {
-       struct inode            *inode = iocb->ki_filp->f_mapping->host;
-       dio_iodone_t            *endio = NULL;
-       int                     flags = 0;
-       struct block_device     *bdev;
-       if (iov_iter_rw(iter) == WRITE) {
-               endio = xfs_end_io_direct_write;
-               flags = DIO_ASYNC_EXTEND;
-       }
-       if (IS_DAX(inode)) {
-               return dax_do_io(iocb, inode, iter,
-                                xfs_get_blocks_direct, endio, 0);
-       }
-       bdev = xfs_find_bdev_for_inode(inode);
-       return  __blockdev_direct_IO(iocb, inode, bdev, iter,
-                       xfs_get_blocks_direct, endio, NULL, flags);
- }
- /*
-  * Punch out the delalloc blocks we have already allocated.
-  *
-  * Don't bother with xfs_setattr given that nothing can have made it to disk yet
-  * as the page is still locked at this point.
-  */
- STATIC void
- xfs_vm_kill_delalloc_range(
-       struct inode            *inode,
-       loff_t                  start,
-       loff_t                  end)
- {
-       struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fileoff_t           start_fsb;
-       xfs_fileoff_t           end_fsb;
-       int                     error;
-       start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
-       end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
-       if (end_fsb <= start_fsb)
-               return;
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-                                               end_fsb - start_fsb);
-       if (error) {
-               /* something screwed, just bail */
-               if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                       xfs_alert(ip->i_mount,
-               "xfs_vm_write_failed: unable to clean up ino %lld",
-                                       ip->i_ino);
-               }
-       }
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
- STATIC void
- xfs_vm_write_failed(
-       struct inode            *inode,
-       struct page             *page,
-       loff_t                  pos,
-       unsigned                len)
- {
-       loff_t                  block_offset;
-       loff_t                  block_start;
-       loff_t                  block_end;
-       loff_t                  from = pos & (PAGE_SIZE - 1);
-       loff_t                  to = from + len;
-       struct buffer_head      *bh, *head;
-       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
        /*
-        * The request pos offset might be 32 or 64 bit, this is all fine
-        * on 64-bit platform.  However, for 64-bit pos request on 32-bit
-        * platform, the high 32-bit will be masked off if we evaluate the
-        * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
-        * 0xfffff000 as an unsigned long, hence the result is incorrect
-        * which could cause the following ASSERT failed in most cases.
-        * In order to avoid this, we can evaluate the block_offset of the
-        * start of the page by using shifts rather than masks the mismatch
-        * problem.
+        * We just need the method present so that open/fcntl allow direct I/O.
         */
-       block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
-       ASSERT(block_offset + from == pos);
-       head = page_buffers(page);
-       block_start = 0;
-       for (bh = head; bh != head || !block_start;
-            bh = bh->b_this_page, block_start = block_end,
-                                  block_offset += bh->b_size) {
-               block_end = block_start + bh->b_size;
-               /* skip buffers before the write */
-               if (block_end <= from)
-                       continue;
-               /* if the buffer is after the write, we're done */
-               if (block_start >= to)
-                       break;
-               /*
-                * Process delalloc and unwritten buffers beyond EOF. We can
-                * encounter unwritten buffers in the event that a file has
-                * post-EOF unwritten extents and an extending write happens to
-                * fail (e.g., an unaligned write that also involves a delalloc
-                * to the same page).
-                */
-               if (!buffer_delay(bh) && !buffer_unwritten(bh))
-                       continue;
-               if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
-                   block_offset < i_size_read(inode))
-                       continue;
-               if (buffer_delay(bh))
-                       xfs_vm_kill_delalloc_range(inode, block_offset,
-                                                  block_offset + bh->b_size);
-               /*
-                * This buffer does not contain data anymore. make sure anyone
-                * who finds it knows that for certain.
-                */
-               clear_buffer_delay(bh);
-               clear_buffer_uptodate(bh);
-               clear_buffer_mapped(bh);
-               clear_buffer_new(bh);
-               clear_buffer_dirty(bh);
-               clear_buffer_unwritten(bh);
-       }
- }
- /*
-  * This used to call block_write_begin(), but it unlocks and releases the page
-  * on error, and we need that page to be able to punch stale delalloc blocks out
-  * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
-  * the appropriate point.
-  */
- STATIC int
- xfs_vm_write_begin(
-       struct file             *file,
-       struct address_space    *mapping,
-       loff_t                  pos,
-       unsigned                len,
-       unsigned                flags,
-       struct page             **pagep,
-       void                    **fsdata)
- {
-       pgoff_t                 index = pos >> PAGE_SHIFT;
-       struct page             *page;
-       int                     status;
-       struct xfs_mount        *mp = XFS_I(mapping->host)->i_mount;
-       ASSERT(len <= PAGE_SIZE);
-       page = grab_cache_page_write_begin(mapping, index, flags);
-       if (!page)
-               return -ENOMEM;
-       status = __block_write_begin(page, pos, len, xfs_get_blocks);
-       if (xfs_mp_fail_writes(mp))
-               status = -EIO;
-       if (unlikely(status)) {
-               struct inode    *inode = mapping->host;
-               size_t          isize = i_size_read(inode);
-               xfs_vm_write_failed(inode, page, pos, len);
-               unlock_page(page);
-               /*
-                * If the write is beyond EOF, we only want to kill blocks
-                * allocated in this write, not blocks that were previously
-                * written successfully.
-                */
-               if (xfs_mp_fail_writes(mp))
-                       isize = 0;
-               if (pos + len > isize) {
-                       ssize_t start = max_t(ssize_t, pos, isize);
-                       truncate_pagecache_range(inode, start, pos + len);
-               }
-               put_page(page);
-               page = NULL;
-       }
-       *pagep = page;
-       return status;
- }
- /*
-  * On failure, we only need to kill delalloc blocks beyond EOF in the range of
-  * this specific write because they will never be written. Previous writes
-  * beyond EOF where block allocation succeeded do not need to be trashed, so
-  * only new blocks from this write should be trashed. For blocks within
-  * EOF, generic_write_end() zeros them so they are safe to leave alone and be
-  * written with all the other valid data.
-  */
- STATIC int
- xfs_vm_write_end(
-       struct file             *file,
-       struct address_space    *mapping,
-       loff_t                  pos,
-       unsigned                len,
-       unsigned                copied,
-       struct page             *page,
-       void                    *fsdata)
- {
-       int                     ret;
-       ASSERT(len <= PAGE_SIZE);
-       ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-       if (unlikely(ret < len)) {
-               struct inode    *inode = mapping->host;
-               size_t          isize = i_size_read(inode);
-               loff_t          to = pos + len;
-               if (to > isize) {
-                       /* only kill blocks in this write beyond EOF */
-                       if (pos > isize)
-                               isize = pos;
-                       xfs_vm_kill_delalloc_range(inode, isize, to);
-                       truncate_pagecache_range(inode, isize, to);
-               }
-       }
-       return ret;
+       return -EINVAL;
  }
  
  STATIC sector_t
@@@ -1748,8 -1513,6 +1514,6 @@@ const struct address_space_operations x
        .set_page_dirty         = xfs_vm_set_page_dirty,
        .releasepage            = xfs_vm_releasepage,
        .invalidatepage         = xfs_vm_invalidatepage,
-       .write_begin            = xfs_vm_write_begin,
-       .write_end              = xfs_vm_write_end,
        .bmap                   = xfs_vm_bmap,
        .direct_IO              = xfs_vm_direct_IO,
        .migratepage            = buffer_migrate_page,
diff --combined fs/xfs/xfs_buf.c
@@@ -79,6 -79,47 +79,47 @@@ xfs_buf_vmap_len
        return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
  }
  
+ /*
+  * Bump the I/O in flight count on the buftarg if we haven't yet done so for
+  * this buffer. The count is incremented once per buffer (per hold cycle)
+  * because the corresponding decrement is deferred to buffer release. Buffers
+  * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
+  * tracking adds unnecessary overhead. This is used for synchronization purposes
+  * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
+  * in-flight buffers.
+  *
+  * Buffers that are never released (e.g., superblock, iclog buffers) must set
+  * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
+  * never reaches zero and unmount hangs indefinitely.
+  */
+ static inline void
+ xfs_buf_ioacct_inc(
+       struct xfs_buf  *bp)
+ {
+       if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
+               return;
+       ASSERT(bp->b_flags & XBF_ASYNC);
+       bp->b_flags |= _XBF_IN_FLIGHT;
+       percpu_counter_inc(&bp->b_target->bt_io_count);
+ }
+ /*
+  * Clear the in-flight state on a buffer about to be released to the LRU or
+  * freed and unaccount from the buftarg.
+  */
+ static inline void
+ xfs_buf_ioacct_dec(
+       struct xfs_buf  *bp)
+ {
+       if (!(bp->b_flags & _XBF_IN_FLIGHT))
+               return;
+       ASSERT(bp->b_flags & XBF_ASYNC);
+       bp->b_flags &= ~_XBF_IN_FLIGHT;
+       percpu_counter_dec(&bp->b_target->bt_io_count);
+ }
  /*
   * When we mark a buffer stale, we remove the buffer from the LRU and clear the
   * b_lru_ref count so that the buffer is freed immediately when the buffer
@@@ -102,6 -143,14 +143,14 @@@ xfs_buf_stale
         */
        bp->b_flags &= ~_XBF_DELWRI_Q;
  
+       /*
+        * Once the buffer is marked stale and unlocked, a subsequent lookup
+        * could reset b_flags. There is no guarantee that the buffer is
+        * unaccounted (released to LRU) before that occurs. Drop in-flight
+        * status now to preserve accounting consistency.
+        */
+       xfs_buf_ioacct_dec(bp);
        spin_lock(&bp->b_lock);
        atomic_set(&bp->b_lru_ref, 0);
        if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
@@@ -815,7 -864,8 +864,8 @@@ xfs_buf_get_uncached
        struct xfs_buf          *bp;
        DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
  
-       bp = _xfs_buf_alloc(target, &map, 1, 0);
+       /* flags might contain irrelevant bits, pass only what we care about */
+       bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
        if (unlikely(bp == NULL))
                goto fail;
  
@@@ -866,63 -916,85 +916,85 @@@ xfs_buf_hold
  }
  
  /*
-  *    Releases a hold on the specified buffer.  If the
-  *    the hold count is 1, calls xfs_buf_free.
+  * Release a hold on the specified buffer. If the hold count is 1, the buffer is
+  * placed on LRU or freed (depending on b_lru_ref).
   */
  void
  xfs_buf_rele(
        xfs_buf_t               *bp)
  {
        struct xfs_perag        *pag = bp->b_pag;
+       bool                    release;
+       bool                    freebuf = false;
  
        trace_xfs_buf_rele(bp, _RET_IP_);
  
        if (!pag) {
                ASSERT(list_empty(&bp->b_lru));
                ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
-               if (atomic_dec_and_test(&bp->b_hold))
+               if (atomic_dec_and_test(&bp->b_hold)) {
+                       xfs_buf_ioacct_dec(bp);
                        xfs_buf_free(bp);
+               }
                return;
        }
  
        ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
  
        ASSERT(atomic_read(&bp->b_hold) > 0);
-       if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-               spin_lock(&bp->b_lock);
-               if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
-                       /*
-                        * If the buffer is added to the LRU take a new
-                        * reference to the buffer for the LRU and clear the
-                        * (now stale) dispose list state flag
-                        */
-                       if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
-                               bp->b_state &= ~XFS_BSTATE_DISPOSE;
-                               atomic_inc(&bp->b_hold);
-                       }
-                       spin_unlock(&bp->b_lock);
-                       spin_unlock(&pag->pag_buf_lock);
-               } else {
-                       /*
-                        * most of the time buffers will already be removed from
-                        * the LRU, so optimise that case by checking for the
-                        * XFS_BSTATE_DISPOSE flag indicating the last list the
-                        * buffer was on was the disposal list
-                        */
-                       if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
-                               list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
-                       } else {
-                               ASSERT(list_empty(&bp->b_lru));
-                       }
-                       spin_unlock(&bp->b_lock);
  
-                       ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
-                       rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
-                       spin_unlock(&pag->pag_buf_lock);
-                       xfs_perag_put(pag);
-                       xfs_buf_free(bp);
+       release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+       spin_lock(&bp->b_lock);
+       if (!release) {
+               /*
+                * Drop the in-flight state if the buffer is already on the LRU
+                * and it holds the only reference. This is racy because we
+                * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
+                * ensures the decrement occurs only once per-buf.
+                */
+               if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+                       xfs_buf_ioacct_dec(bp);
+               goto out_unlock;
+       }
+       /* the last reference has been dropped ... */
+       xfs_buf_ioacct_dec(bp);
+       if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+               /*
+                * If the buffer is added to the LRU take a new reference to the
+                * buffer for the LRU and clear the (now stale) dispose list
+                * state flag
+                */
+               if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+                       bp->b_state &= ~XFS_BSTATE_DISPOSE;
+                       atomic_inc(&bp->b_hold);
+               }
+               spin_unlock(&pag->pag_buf_lock);
+       } else {
+               /*
+                * most of the time buffers will already be removed from the
+                * LRU, so optimise that case by checking for the
+                * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
+                * was on was the disposal list
+                */
+               if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+                       list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+               } else {
+                       ASSERT(list_empty(&bp->b_lru));
                }
+               ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+               rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+               spin_unlock(&pag->pag_buf_lock);
+               xfs_perag_put(pag);
+               freebuf = true;
        }
+ out_unlock:
+       spin_unlock(&bp->b_lock);
+       if (freebuf)
+               xfs_buf_free(bp);
  }
  
  
@@@ -944,10 -1016,12 +1016,12 @@@ xfs_buf_trylock
        int                     locked;
  
        locked = down_trylock(&bp->b_sema) == 0;
-       if (locked)
+       if (locked) {
                XB_SET_OWNER(bp);
-       trace_xfs_buf_trylock(bp, _RET_IP_);
+               trace_xfs_buf_trylock(bp, _RET_IP_);
+       } else {
+               trace_xfs_buf_trylock_fail(bp, _RET_IP_);
+       }
        return locked;
  }
  
@@@ -1127,8 -1201,7 +1201,8 @@@ xfs_buf_ioapply_map
        int             map,
        int             *buf_offset,
        int             *count,
 -      int             rw)
 +      int             op,
 +      int             op_flags)
  {
        int             page_index;
        int             total_nr_pages = bp->b_page_count;
  
  next_chunk:
        atomic_inc(&bp->b_io_remaining);
 -      nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
 -      if (nr_pages > total_nr_pages)
 -              nr_pages = total_nr_pages;
 +      nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
  
        bio = bio_alloc(GFP_NOIO, nr_pages);
        bio->bi_bdev = bp->b_target->bt_bdev;
        bio->bi_iter.bi_sector = sector;
        bio->bi_end_io = xfs_buf_bio_end_io;
        bio->bi_private = bp;
 -
 +      bio_set_op_attrs(bio, op, op_flags);
  
        for (; size && nr_pages; nr_pages--, page_index++) {
                int     rbytes, nbytes = PAGE_SIZE - offset;
                        flush_kernel_vmap_range(bp->b_addr,
                                                xfs_buf_vmap_len(bp));
                }
 -              submit_bio(rw, bio);
 +              submit_bio(bio);
                if (size)
                        goto next_chunk;
        } else {
@@@ -1209,8 -1284,7 +1283,8 @@@ _xfs_buf_ioapply
        struct xfs_buf  *bp)
  {
        struct blk_plug plug;
 -      int             rw;
 +      int             op;
 +      int             op_flags = 0;
        int             offset;
        int             size;
        int             i;
                bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;
  
        if (bp->b_flags & XBF_WRITE) {
 +              op = REQ_OP_WRITE;
                if (bp->b_flags & XBF_SYNCIO)
 -                      rw = WRITE_SYNC;
 -              else
 -                      rw = WRITE;
 +                      op_flags = WRITE_SYNC;
                if (bp->b_flags & XBF_FUA)
 -                      rw |= REQ_FUA;
 +                      op_flags |= REQ_FUA;
                if (bp->b_flags & XBF_FLUSH)
 -                      rw |= REQ_FLUSH;
 +                      op_flags |= REQ_PREFLUSH;
  
                /*
                 * Run the write verifier callback function if it exists. If
                        }
                }
        } else if (bp->b_flags & XBF_READ_AHEAD) {
 -              rw = READA;
 +              op = REQ_OP_READ;
 +              op_flags = REQ_RAHEAD;
        } else {
 -              rw = READ;
 +              op = REQ_OP_READ;
        }
  
        /* we only use the buffer cache for meta-data */
 -      rw |= REQ_META;
 +      op_flags |= REQ_META;
  
        /*
         * Walk all the vectors issuing IO on them. Set up the initial offset
        size = BBTOB(bp->b_io_length);
        blk_start_plug(&plug);
        for (i = 0; i < bp->b_map_count; i++) {
 -              xfs_buf_ioapply_map(bp, i, &offset, &size, rw);
 +              xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
                if (bp->b_error)
                        break;
                if (size <= 0)
@@@ -1339,6 -1413,7 +1413,7 @@@ xfs_buf_submit
         * xfs_buf_ioend too early.
         */
        atomic_set(&bp->b_io_remaining, 1);
+       xfs_buf_ioacct_inc(bp);
        _xfs_buf_ioapply(bp);
  
        /*
@@@ -1524,13 -1599,19 +1599,19 @@@ xfs_wait_buftarg
        int loop = 0;
  
        /*
-        * We need to flush the buffer workqueue to ensure that all IO
-        * completion processing is 100% done. Just waiting on buffer locks is
-        * not sufficient for async IO as the reference count held over IO is
-        * not released until after the buffer lock is dropped. Hence we need to
-        * ensure here that all reference counts have been dropped before we
-        * start walking the LRU list.
+        * First wait on the buftarg I/O count for all in-flight buffers to be
+        * released. This is critical as new buffers do not make the LRU until
+        * they are released.
+        *
+        * Next, flush the buffer workqueue to ensure all completion processing
+        * has finished. Just waiting on buffer locks is not sufficient for
+        * async IO as the reference count held over IO is not released until
+        * after the buffer lock is dropped. Hence we need to ensure here that
+        * all reference counts have been dropped before we start walking the
+        * LRU list.
         */
+       while (percpu_counter_sum(&btp->bt_io_count))
+               delay(100);
        drain_workqueue(btp->bt_mount->m_buf_workqueue);
  
        /* loop until there is nothing left on the lru list. */
@@@ -1627,6 -1708,8 +1708,8 @@@ xfs_free_buftarg
        struct xfs_buftarg      *btp)
  {
        unregister_shrinker(&btp->bt_shrinker);
+       ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+       percpu_counter_destroy(&btp->bt_io_count);
        list_lru_destroy(&btp->bt_lru);
  
        if (mp->m_flags & XFS_MOUNT_BARRIER)
@@@ -1691,6 -1774,9 +1774,9 @@@ xfs_alloc_buftarg
        if (list_lru_init(&btp->bt_lru))
                goto error;
  
+       if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+               goto error;
        btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
        btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
        btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@@ -1774,18 -1860,33 +1860,33 @@@ xfs_buf_cmp
        return 0;
  }
  
+ /*
+  * submit buffers for write.
+  *
+  * When we have a large buffer list, we do not want to hold all the buffers
+  * locked while we block on the request queue waiting for IO dispatch. To avoid
+  * this problem, we lock and submit buffers in groups of 50, thereby minimising
+  * the lock hold times for lists which may contain thousands of objects.
+  *
+  * To do this, we sort the buffer list before we walk the list to lock and
+  * submit buffers, and we plug and unplug around each group of buffers we
+  * submit.
+  */
  static int
- __xfs_buf_delwri_submit(
+ xfs_buf_delwri_submit_buffers(
        struct list_head        *buffer_list,
-       struct list_head        *io_list,
-       bool                    wait)
+       struct list_head        *wait_list)
  {
-       struct blk_plug         plug;
        struct xfs_buf          *bp, *n;
+       LIST_HEAD               (submit_list);
        int                     pinned = 0;
+       struct blk_plug         plug;
  
+       list_sort(NULL, buffer_list, xfs_buf_cmp);
+       blk_start_plug(&plug);
        list_for_each_entry_safe(bp, n, buffer_list, b_list) {
-               if (!wait) {
+               if (!wait_list) {
                        if (xfs_buf_ispinned(bp)) {
                                pinned++;
                                continue;
                        continue;
                }
  
-               list_move_tail(&bp->b_list, io_list);
                trace_xfs_buf_delwri_split(bp, _RET_IP_);
-       }
-       list_sort(NULL, io_list, xfs_buf_cmp);
-       blk_start_plug(&plug);
-       list_for_each_entry_safe(bp, n, io_list, b_list) {
-               bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
-               bp->b_flags |= XBF_WRITE | XBF_ASYNC;
  
                /*
-                * we do all Io submission async. This means if we need to wait
-                * for IO completion we need to take an extra reference so the
-                * buffer is still valid on the other side.
+                * We do all IO submission async. This means if we need
+                * to wait for IO completion we need to take an extra
+                * reference so the buffer is still valid on the other
+                * side. We need to move the buffer onto the io_list
+                * at this point so the caller can still access it.
                 */
-               if (wait)
+               bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
+               bp->b_flags |= XBF_WRITE | XBF_ASYNC;
+               if (wait_list) {
                        xfs_buf_hold(bp);
-               else
+                       list_move_tail(&bp->b_list, wait_list);
+               } else
                        list_del_init(&bp->b_list);
  
                xfs_buf_submit(bp);
@@@ -1849,8 -1946,7 +1946,7 @@@ in
  xfs_buf_delwri_submit_nowait(
        struct list_head        *buffer_list)
  {
-       LIST_HEAD               (io_list);
-       return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
+       return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
  }
  
  /*
  xfs_buf_delwri_submit(
        struct list_head        *buffer_list)
  {
-       LIST_HEAD               (io_list);
+       LIST_HEAD               (wait_list);
        int                     error = 0, error2;
        struct xfs_buf          *bp;
  
-       __xfs_buf_delwri_submit(buffer_list, &io_list, true);
+       xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
  
        /* Wait for IO to complete. */
-       while (!list_empty(&io_list)) {
-               bp = list_first_entry(&io_list, struct xfs_buf, b_list);
+       while (!list_empty(&wait_list)) {
+               bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
  
                list_del_init(&bp->b_list);
  
diff --combined fs/xfs/xfs_file.c
@@@ -37,6 -37,7 +37,7 @@@
  #include "xfs_log.h"
  #include "xfs_icache.h"
  #include "xfs_pnfs.h"
+ #include "xfs_iomap.h"
  
  #include <linux/dcache.h>
  #include <linux/falloc.h>
@@@ -80,61 -81,17 +81,17 @@@ xfs_rw_ilock_demote
  }
  
  /*
-  * xfs_iozero clears the specified range supplied via the page cache (except in
-  * the DAX case). Writes through the page cache will allocate blocks over holes,
-  * though the callers usually map the holes first and avoid them. If a block is
-  * not completely zeroed, then it will be read from disk before being partially
-  * zeroed.
-  *
-  * In the DAX case, we can just directly write to the underlying pages. This
-  * will not allocate blocks, but will avoid holes and unwritten extents and so
-  * not do unnecessary work.
+  * Clear the specified ranges to zero through either the pagecache or DAX.
+  * Holes and unwritten extents will be left as-is as they already are zeroed.
   */
  int
- xfs_iozero(
-       struct xfs_inode        *ip,    /* inode                        */
-       loff_t                  pos,    /* offset in file               */
-       size_t                  count)  /* size of data to zero         */
+ xfs_zero_range(
+       struct xfs_inode        *ip,
+       xfs_off_t               pos,
+       xfs_off_t               count,
+       bool                    *did_zero)
  {
-       struct page             *page;
-       struct address_space    *mapping;
-       int                     status = 0;
-       mapping = VFS_I(ip)->i_mapping;
-       do {
-               unsigned offset, bytes;
-               void *fsdata;
-               offset = (pos & (PAGE_SIZE -1)); /* Within page */
-               bytes = PAGE_SIZE - offset;
-               if (bytes > count)
-                       bytes = count;
-               if (IS_DAX(VFS_I(ip))) {
-                       status = dax_zero_page_range(VFS_I(ip), pos, bytes,
-                                                    xfs_get_blocks_direct);
-                       if (status)
-                               break;
-               } else {
-                       status = pagecache_write_begin(NULL, mapping, pos, bytes,
-                                               AOP_FLAG_UNINTERRUPTIBLE,
-                                               &page, &fsdata);
-                       if (status)
-                               break;
-                       zero_user(page, offset, bytes);
-                       status = pagecache_write_end(NULL, mapping, pos, bytes,
-                                               bytes, page, fsdata);
-                       WARN_ON(status <= 0); /* can't return less than zero! */
-                       status = 0;
-               }
-               pos += bytes;
-               count -= bytes;
-       } while (count);
-       return status;
+       return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
  }
  
  int
@@@ -282,48 -239,35 +239,35 @@@ xfs_file_fsync
  }
  
  STATIC ssize_t
- xfs_file_read_iter(
+ xfs_file_dio_aio_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
  {
-       struct file             *file = iocb->ki_filp;
-       struct inode            *inode = file->f_mapping->host;
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       size_t                  size = iov_iter_count(to);
+       loff_t                  isize = i_size_read(inode);
+       size_t                  count = iov_iter_count(to);
+       struct iov_iter         data;
+       struct xfs_buftarg      *target;
        ssize_t                 ret = 0;
-       int                     ioflags = 0;
-       xfs_fsize_t             n;
-       loff_t                  pos = iocb->ki_pos;
  
-       XFS_STATS_INC(mp, xs_read_calls);
-       if (unlikely(iocb->ki_flags & IOCB_DIRECT))
-               ioflags |= XFS_IO_ISDIRECT;
-       if (file->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
-       if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
-               xfs_buftarg_t   *target =
-                       XFS_IS_REALTIME_INODE(ip) ?
-                               mp->m_rtdev_targp : mp->m_ddev_targp;
-               /* DIO must be aligned to device logical sector size */
-               if ((pos | size) & target->bt_logical_sectormask) {
-                       if (pos == i_size_read(inode))
-                               return 0;
-                       return -EINVAL;
-               }
-       }
+       trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
  
-       n = mp->m_super->s_maxbytes - pos;
-       if (n <= 0 || size == 0)
-               return 0;
+       if (!count)
+               return 0; /* skip atime */
  
-       if (n < size)
-               size = n;
+       if (XFS_IS_REALTIME_INODE(ip))
+               target = ip->i_mount->m_rtdev_targp;
+       else
+               target = ip->i_mount->m_ddev_targp;
  
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -EIO;
+       /* DIO must be aligned to device logical sector size */
+       if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+               if (iocb->ki_pos == isize)
+                       return 0;
+               return -EINVAL;
+       }
  
        /*
         * Locking is a bit tricky here. If we take an exclusive lock for direct
         * serialisation.
         */
        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-       if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+       if (mapping->nrpages) {
                xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
                xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
  
                 * flush and reduce the chances of repeated iolock cycles going
                 * forward.
                 */
-               if (inode->i_mapping->nrpages) {
-                       ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+               if (mapping->nrpages) {
+                       ret = filemap_write_and_wait(mapping);
                        if (ret) {
                                xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
                                return ret;
                         * we fail to invalidate a page, but this should never
                         * happen on XFS. Warn if it does fail.
                         */
-                       ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+                       ret = invalidate_inode_pages2(mapping);
                        WARN_ON_ONCE(ret);
                        ret = 0;
                }
                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
        }
  
-       trace_xfs_file_read(ip, size, pos, ioflags);
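+       /*
+        * Run the direct I/O on a private copy of the iterator so the caller's
+        * iov_iter is only advanced by the bytes actually read.  Unlike the
+        * write path, no completion callback is needed because reads never
+        * convert unwritten extents or update the file size.
+        */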
+       data = *to;
+       ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+                       xfs_get_blocks_direct, NULL, NULL, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(to, ret);
+       }
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
  
+       file_accessed(iocb->ki_filp);
+       return ret;
+ }
+
+ static noinline ssize_t
+ xfs_file_dax_read(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+ {
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct iov_iter         data = *to;
+       size_t                  count = iov_iter_count(to);
+       ssize_t                 ret = 0;
+
+       trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+       if (!count)
+               return 0; /* skip atime */
+
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(to, ret);
+       }
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       file_accessed(iocb->ki_filp);
+       return ret;
+ }
+
+ STATIC ssize_t
+ xfs_file_buffered_aio_read(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+ {
+       struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
+       ssize_t                 ret;
+
+       trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
        ret = generic_file_read_iter(iocb, to);
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+       return ret;
+ }
+
+ STATIC ssize_t
+ xfs_file_read_iter(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+ {
+       struct inode            *inode = file_inode(iocb->ki_filp);
+       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
+       ssize_t                 ret = 0;
+
+       XFS_STATS_INC(mp, xs_read_calls);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
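+
+       /*
+        * DAX is checked first: DAX inodes always use the DAX read path, even
+        * if the file was opened with O_DIRECT.
+        */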
+       if (IS_DAX(inode))
+               ret = xfs_file_dax_read(iocb, to);
+       else if (iocb->ki_flags & IOCB_DIRECT)
+               ret = xfs_file_dio_aio_read(iocb, to);
+       else
+               ret = xfs_file_buffered_aio_read(iocb, to);
        if (ret > 0)
                XFS_STATS_ADD(mp, xs_read_bytes, ret);
-       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
  }
  
@@@ -389,18 -408,14 +408,14 @@@ xfs_file_splice_read
        unsigned int            flags)
  {
        struct xfs_inode        *ip = XFS_I(infilp->f_mapping->host);
-       int                     ioflags = 0;
        ssize_t                 ret;
  
        XFS_STATS_INC(ip->i_mount, xs_read_calls);
  
-       if (infilp->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
  
-       trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+       trace_xfs_file_splice_read(ip, count, *ppos);
  
        /*
         * DAX inodes cannot use the page cache for splice, so we have to push
@@@ -423,49 -438,6 +438,6 @@@ out
        return ret;
  }
  
- /*
-  * This routine is called to handle zeroing any space in the last block of the
-  * file that is beyond the EOF.  We do this since the size is being increased
-  * without writing anything to that block and we don't want to read the
-  * garbage on the disk.
-  */
- STATIC int                            /* error (positive) */
- xfs_zero_last_block(
-       struct xfs_inode        *ip,
-       xfs_fsize_t             offset,
-       xfs_fsize_t             isize,
-       bool                    *did_zeroing)
- {
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           last_fsb = XFS_B_TO_FSBT(mp, isize);
-       int                     zero_offset = XFS_B_FSB_OFFSET(mp, isize);
-       int                     zero_len;
-       int                     nimaps = 1;
-       int                     error = 0;
-       struct xfs_bmbt_irec    imap;
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       if (error)
-               return error;
-       ASSERT(nimaps > 0);
-       /*
-        * If the block underlying isize is just a hole, then there
-        * is nothing to zero.
-        */
-       if (imap.br_startblock == HOLESTARTBLOCK)
-               return 0;
-       zero_len = mp->m_sb.sb_blocksize - zero_offset;
-       if (isize + zero_len > offset)
-               zero_len = offset - isize;
-       *did_zeroing = true;
-       return xfs_iozero(ip, isize, zero_len);
- }
  /*
   * Zero any on disk space between the current EOF and the new, larger EOF.
   *
@@@ -484,94 -456,11 +456,11 @@@ xfs_zero_eof
        xfs_fsize_t             isize,          /* current inode size */
        bool                    *did_zeroing)
  {
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           start_zero_fsb;
-       xfs_fileoff_t           end_zero_fsb;
-       xfs_fileoff_t           zero_count_fsb;
-       xfs_fileoff_t           last_fsb;
-       xfs_fileoff_t           zero_off;
-       xfs_fsize_t             zero_len;
-       int                     nimaps;
-       int                     error = 0;
-       struct xfs_bmbt_irec    imap;
        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
        ASSERT(offset > isize);
  
        trace_xfs_zero_eof(ip, isize, offset - isize);
-       /*
-        * First handle zeroing the block on which isize resides.
-        *
-        * We only zero a part of that block so it is handled specially.
-        */
-       if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
-               error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
-               if (error)
-                       return error;
-       }
-       /*
-        * Calculate the range between the new size and the old where blocks
-        * needing to be zeroed may exist.
-        *
-        * To get the block where the last byte in the file currently resides,
-        * we need to subtract one from the size and truncate back to a block
-        * boundary.  We subtract 1 in case the size is exactly on a block
-        * boundary.
-        */
-       last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
-       start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
-       end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
-       ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
-       if (last_fsb == end_zero_fsb) {
-               /*
-                * The size was only incremented on its last block.
-                * We took care of that above, so just return.
-                */
-               return 0;
-       }
-       ASSERT(start_zero_fsb <= end_zero_fsb);
-       while (start_zero_fsb <= end_zero_fsb) {
-               nimaps = 1;
-               zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
-                                         &imap, &nimaps, 0);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               if (error)
-                       return error;
-               ASSERT(nimaps > 0);
-               if (imap.br_state == XFS_EXT_UNWRITTEN ||
-                   imap.br_startblock == HOLESTARTBLOCK) {
-                       start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-                       ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-                       continue;
-               }
-               /*
-                * There are blocks we need to zero.
-                */
-               zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
-               zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-               if ((zero_off + zero_len) > offset)
-                       zero_len = offset - zero_off;
-               error = xfs_iozero(ip, zero_off, zero_len);
-               if (error)
-                       return error;
-               *did_zeroing = true;
-               start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-               ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-       }
-       return 0;
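+       /*
+        * xfs_zero_range() zeroes both the partial block at the old EOF and
+        * any whole blocks up to the new EOF, skipping holes and unwritten
+        * extents.
+        */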
+       return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
  }
  
  /*
@@@ -722,8 -611,7 +611,7 @@@ xfs_file_dio_aio_write
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
  
        /* DIO must be aligned to device logical sector size */
-       if (!IS_DAX(inode) &&
-           ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+       if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
                return -EINVAL;
  
        /* "unaligned" here means not aligned to a filesystem block */
        end = iocb->ki_pos + count - 1;
  
        /*
-        * See xfs_file_read_iter() for why we do a full-file flush here.
+        * See xfs_file_dio_aio_read() for why we do a full-file flush here.
         */
        if (mapping->nrpages) {
                ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
                iolock = XFS_IOLOCK_SHARED;
        }
  
-       trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+       trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
  
        data = *from;
-       ret = mapping->a_ops->direct_IO(iocb, &data);
+       ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+                       xfs_get_blocks_direct, xfs_end_io_direct_write,
+                       NULL, DIO_ASYNC_EXTEND);
  
        /* see generic_file_direct_write() for why this is necessary */
        if (mapping->nrpages) {
@@@ -809,10 -699,70 +699,70 @@@ out
        xfs_rw_iunlock(ip, iolock);
  
        /*
-        * No fallback to buffered IO on errors for XFS. DAX can result in
-        * partial writes, but direct IO will either complete fully or fail.
+        * No fallback to buffered IO on errors for XFS; direct IO will either
+        * complete fully or fail.
         */
-       ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+       ASSERT(ret < 0 || ret == count);
+       return ret;
+ }
+
+ static noinline ssize_t
+ xfs_file_dax_write(
+       struct kiocb            *iocb,
+       struct iov_iter         *from)
+ {
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       ssize_t                 ret = 0;
+       int                     unaligned_io = 0;
+       int                     iolock;
+       struct iov_iter         data;
+
+       /* "unaligned" here means not aligned to a filesystem block */
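+       /*
+        * Unaligned writes and writes over cached pages take the iolock
+        * exclusive; aligned writes to a clean mapping can proceed under the
+        * shared lock.
+        */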
+       if ((iocb->ki_pos & mp->m_blockmask) ||
+           ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+               unaligned_io = 1;
+               iolock = XFS_IOLOCK_EXCL;
+       } else if (mapping->nrpages) {
+               iolock = XFS_IOLOCK_EXCL;
+       } else {
+               iolock = XFS_IOLOCK_SHARED;
+       }
+       xfs_rw_ilock(ip, iolock);
+       ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+       if (ret)
+               goto out;
+
+       /*
+        * Yes, even DAX files can have page cache attached to them:  A zeroed
+        * page is inserted into the pagecache when we have to serve a write
+        * fault on a hole.  It should never be dirtied and can simply be
+        * dropped from the pagecache once we get real data for the page.
+        */
+       if (mapping->nrpages) {
+               ret = invalidate_inode_pages2(mapping);
+               WARN_ON_ONCE(ret);
+       }
+
+       if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+               iolock = XFS_IOLOCK_SHARED;
+       }
+
+       trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+       data = *from;
+       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+                       xfs_end_io_direct_write, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(from, ret);
+       }
+ out:
+       xfs_rw_iunlock(ip, iolock);
        return ret;
  }
  
@@@ -839,9 -789,8 +789,8 @@@ xfs_file_buffered_aio_write
        current->backing_dev_info = inode_to_bdi(inode);
  
  write_retry:
-       trace_xfs_file_buffered_write(ip, iov_iter_count(from),
-                                     iocb->ki_pos, 0);
-       ret = generic_perform_write(file, from, iocb->ki_pos);
+       trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
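+       /* generic iomap-based buffered write, driven by the xfs_iomap_ops callbacks */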
+       ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
        if (likely(ret >= 0))
                iocb->ki_pos += ret;
  
@@@ -895,7 -844,9 +844,9 @@@ xfs_file_write_iter
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
  
-       if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+       if (IS_DAX(inode))
+               ret = xfs_file_dax_write(iocb, from);
+       else if (iocb->ki_flags & IOCB_DIRECT)
                ret = xfs_file_dio_aio_write(iocb, from);
        else
                ret = xfs_file_buffered_aio_write(iocb, from);
@@@ -1551,9 -1502,9 +1502,9 @@@ xfs_filemap_page_mkwrite
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
        if (IS_DAX(inode)) {
 -              ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
 +              ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
        } else {
-               ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+               ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
                ret = block_page_mkwrite_return(ret);
        }
  
@@@ -1585,7 -1536,7 +1536,7 @@@ xfs_filemap_fault
                 * changes to xfs_get_blocks_direct() to map unwritten extent
                 * ioend for conversion on read-only mappings.
                 */
 -              ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
 +              ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
        } else
                ret = filemap_fault(vma, vmf);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@@ -1622,7 -1573,7 +1573,7 @@@ xfs_filemap_pmd_fault
        }
  
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 -      ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
 +      ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
        if (flags & FAULT_FLAG_WRITE)