Merge tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 27 Jul 2016 16:53:35 +0000 (09:53 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 27 Jul 2016 16:53:35 +0000 (09:53 -0700)
Pull xfs updates from Dave Chinner:
 "The major addition is the new iomap based block mapping
  infrastructure.  We've been kicking this about locally for years, but
   there are other filesystems that want to use it too (e.g. gfs2).  Now
   it is fully working, reviewed and ready to be merged and used by
   other filesystems.

  There are a lot of other fixes and cleanups in the tree, but those are
  XFS internal things and none are of the scale or visibility of the
  iomap changes.  See below for details.

   I am likely to send another pull request next week - we're just about
   ready to merge some new functionality (on-disk block->owner reverse
   mapping infrastructure), but that's a huge chunk of code (74 files
   changed, 7283 insertions(+), 1114 deletions(-)) so I'm keeping that
   separate from all the "normal" pull request changes so they don't get
   lost in the noise.

  Summary of changes in this update:
   - generic iomap based IO path infrastructure
   - generic iomap based fiemap implementation
   - xfs iomap based IO path implementation
   - buffer error handling fixes
   - tracking of in-flight buffer IO for unmount serialisation
   - direct IO and DAX IO path separation and simplification
   - shortform directory format definition changes for wider platform
     compatibility
   - various buffer cache fixes
   - cleanups in preparation for rmap merge
   - error injection cleanups and fixes
   - log item format buffer memory allocation restructuring to prevent
     rare OOM reclaim deadlocks
   - sparse inode chunks are now fully supported"

* tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (53 commits)
  xfs: remove EXPERIMENTAL tag from sparse inode feature
  xfs: bufferhead chains are invalid after end_page_writeback
  xfs: allocate log vector buffers outside CIL context lock
  libxfs: directory node splitting does not have an extra block
  xfs: remove dax code from object file when disabled
  xfs: skip dirty pages in ->releasepage()
  xfs: remove __arch_pack
  xfs: kill xfs_dir2_inou_t
  xfs: kill xfs_dir2_sf_off_t
  xfs: split direct I/O and DAX path
  xfs: direct calls in the direct I/O path
  xfs: stop using generic_file_read_iter for direct I/O
  xfs: split xfs_file_read_iter into buffered and direct I/O helpers
  xfs: remove s_maxbytes enforcement in xfs_file_read_iter
  xfs: kill ioflags
  xfs: don't pass ioflags around in the ioctl path
  xfs: track and serialize in-flight async buffers against unmount
  xfs: exclude never-released buffers from buftarg I/O accounting
  xfs: don't reset b_retries to 0 on every failure
  xfs: remove extraneous buffer flag changes
  ...

fs/buffer.c
fs/internal.h
fs/nfsd/blocklayout.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_file.c

diff --combined fs/buffer.c
@@@ -21,6 -21,7 +21,7 @@@
  #include <linux/kernel.h>
  #include <linux/syscalls.h>
  #include <linux/fs.h>
+ #include <linux/iomap.h>
  #include <linux/mm.h>
  #include <linux/percpu.h>
  #include <linux/slab.h>
@@@ -45,7 -46,7 +46,7 @@@
  #include <trace/events/block.h>
  
  static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 -static int submit_bh_wbc(int rw, struct buffer_head *bh,
 +static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
                         unsigned long bio_flags,
                         struct writeback_control *wbc);
  
@@@ -153,7 -154,7 +154,7 @@@ static void __end_buffer_read_notouch(s
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
 -              /* This happens, due to failed READA attempts. */
 +              /* This happens, due to failed read-ahead attempts. */
                clear_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
@@@ -588,7 -589,7 +589,7 @@@ void write_boundary_block(struct block_
        struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
        if (bh) {
                if (buffer_dirty(bh))
 -                      ll_rw_block(WRITE, 1, &bh);
 +                      ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
                put_bh(bh);
        }
  }
@@@ -1225,7 -1226,7 +1226,7 @@@ static struct buffer_head *__bread_slow
        } else {
                get_bh(bh);
                bh->b_end_io = end_buffer_read_sync;
 -              submit_bh(READ, bh);
 +              submit_bh(REQ_OP_READ, 0, bh);
                wait_on_buffer(bh);
                if (buffer_uptodate(bh))
                        return bh;
@@@ -1395,7 -1396,7 +1396,7 @@@ void __breadahead(struct block_device *
  {
        struct buffer_head *bh = __getblk(bdev, block, size);
        if (likely(bh)) {
 -              ll_rw_block(READA, 1, &bh);
 +              ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
                brelse(bh);
        }
  }
@@@ -1687,7 -1688,7 +1688,7 @@@ static struct buffer_head *create_page_
   * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
   * causes the writes to be flagged as synchronous writes.
   */
 -static int __block_write_full_page(struct inode *inode, struct page *page,
 +int __block_write_full_page(struct inode *inode, struct page *page,
                        get_block_t *get_block, struct writeback_control *wbc,
                        bh_end_io_t *handler)
  {
        struct buffer_head *bh, *head;
        unsigned int blocksize, bbits;
        int nr_underway = 0;
 -      int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
 +      int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
  
        head = create_page_buffers(page, inode,
                                        (1 << BH_Dirty)|(1 << BH_Uptodate));
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
 -                      submit_bh_wbc(write_op, bh, 0, wbc);
 +                      submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
                        nr_underway++;
                }
                bh = next;
@@@ -1840,7 -1841,7 +1841,7 @@@ recover
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        clear_buffer_dirty(bh);
 -                      submit_bh_wbc(write_op, bh, 0, wbc);
 +                      submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
                        nr_underway++;
                }
                bh = next;
        unlock_page(page);
        goto done;
  }
 +EXPORT_SYMBOL(__block_write_full_page);
  
  /*
   * If a page has any new buffers, zero them out here, and mark them uptodate
@@@ -1892,8 -1892,62 +1893,62 @@@ void page_zero_new_buffers(struct page 
  }
  EXPORT_SYMBOL(page_zero_new_buffers);
  
- int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-               get_block_t *get_block)
+ static void
+ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
+               struct iomap *iomap)
+ {
+       loff_t offset = block << inode->i_blkbits;
+       bh->b_bdev = iomap->bdev;
+       /*
+        * Block points to offset in file we need to map, iomap contains
+        * the offset at which the map starts. If the map ends before the
+        * current block, then do not map the buffer and let the caller
+        * handle it.
+        */
+       BUG_ON(offset >= iomap->offset + iomap->length);
+       switch (iomap->type) {
+       case IOMAP_HOLE:
+               /*
+                * If the buffer is not up to date or beyond the current EOF,
+                * we need to mark it as new to ensure sub-block zeroing is
+                * executed if necessary.
+                */
+               if (!buffer_uptodate(bh) ||
+                   (offset >= i_size_read(inode)))
+                       set_buffer_new(bh);
+               break;
+       case IOMAP_DELALLOC:
+               if (!buffer_uptodate(bh) ||
+                   (offset >= i_size_read(inode)))
+                       set_buffer_new(bh);
+               set_buffer_uptodate(bh);
+               set_buffer_mapped(bh);
+               set_buffer_delay(bh);
+               break;
+       case IOMAP_UNWRITTEN:
+               /*
+                * For unwritten regions, we always need to ensure that
+                * sub-block writes zero the parts of the block we are not
+                * writing to. Set the buffer as new to ensure this.
+                */
+               set_buffer_new(bh);
+               set_buffer_unwritten(bh);
+               /* FALLTHRU */
+       case IOMAP_MAPPED:
+               if (offset >= i_size_read(inode))
+                       set_buffer_new(bh);
+               bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
+                               ((offset - iomap->offset) >> inode->i_blkbits);
+               set_buffer_mapped(bh);
+               break;
+       }
+ }
+ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+               get_block_t *get_block, struct iomap *iomap)
  {
        unsigned from = pos & (PAGE_SIZE - 1);
        unsigned to = from + len;
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
-                       err = get_block(inode, block, bh, 1);
-                       if (err)
-                               break;
+                       if (get_block) {
+                               err = get_block(inode, block, bh, 1);
+                               if (err)
+                                       break;
+                       } else {
+                               iomap_to_bh(inode, block, bh, iomap);
+                       }
                        if (buffer_new(bh)) {
                                unmap_underlying_metadata(bh->b_bdev,
                                                        bh->b_blocknr);
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                     (block_start < from || block_end > to)) {
 -                      ll_rw_block(READ, 1, &bh);
 +                      ll_rw_block(REQ_OP_READ, 0, 1, &bh);
                        *wait_bh++=bh;
                }
        }
                page_zero_new_buffers(page, from, to);
        return err;
  }
+ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+               get_block_t *get_block)
+ {
+       return __block_write_begin_int(page, pos, len, get_block, NULL);
+ }
  EXPORT_SYMBOL(__block_write_begin);
  
  static int __block_commit_write(struct inode *inode, struct page *page,
@@@ -2249,7 -2314,7 +2315,7 @@@ int block_read_full_page(struct page *p
                if (buffer_uptodate(bh))
                        end_buffer_async_read(bh, 1);
                else
 -                      submit_bh(READ, bh);
 +                      submit_bh(REQ_OP_READ, 0, bh);
        }
        return 0;
  }
@@@ -2583,7 -2648,7 +2649,7 @@@ int nobh_write_begin(struct address_spa
                if (block_start < from || block_end > to) {
                        lock_buffer(bh);
                        bh->b_end_io = end_buffer_read_nobh;
 -                      submit_bh(READ, bh);
 +                      submit_bh(REQ_OP_READ, 0, bh);
                        nr_reads++;
                }
        }
@@@ -2853,7 -2918,7 +2919,7 @@@ int block_truncate_page(struct address_
  
        if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
                err = -EIO;
 -              ll_rw_block(READ, 1, &bh);
 +              ll_rw_block(REQ_OP_READ, 0, 1, &bh);
                wait_on_buffer(bh);
                /* Uhhuh. Read error. Complain and punt. */
                if (!buffer_uptodate(bh))
@@@ -2950,7 -3015,7 +3016,7 @@@ static void end_bio_bh_io_sync(struct b
   * errors, this only handles the "we need to be able to
   * do IO at the final sector" case.
   */
 -void guard_bio_eod(int rw, struct bio *bio)
 +void guard_bio_eod(int op, struct bio *bio)
  {
        sector_t maxsector;
        struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
        bvec->bv_len -= truncated_bytes;
  
        /* ..and clear the end of the buffer for reads */
 -      if ((rw & RW_MASK) == READ) {
 +      if (op == REQ_OP_READ) {
                zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
                                truncated_bytes);
        }
  }
  
 -static int submit_bh_wbc(int rw, struct buffer_head *bh,
 +static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
                         unsigned long bio_flags, struct writeback_control *wbc)
  {
        struct bio *bio;
        /*
         * Only clear out a write error when rewriting
         */
 -      if (test_set_buffer_req(bh) && (rw & WRITE))
 +      if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
                clear_buffer_write_io_error(bh);
  
        /*
        bio->bi_flags |= bio_flags;
  
        /* Take care of bh's that straddle the end of the device */
 -      guard_bio_eod(rw, bio);
 +      guard_bio_eod(op, bio);
  
        if (buffer_meta(bh))
 -              rw |= REQ_META;
 +              op_flags |= REQ_META;
        if (buffer_prio(bh))
 -              rw |= REQ_PRIO;
 +              op_flags |= REQ_PRIO;
 +      bio_set_op_attrs(bio, op, op_flags);
  
 -      submit_bio(rw, bio);
 +      submit_bio(bio);
        return 0;
  }
  
 -int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 +int _submit_bh(int op, int op_flags, struct buffer_head *bh,
 +             unsigned long bio_flags)
  {
 -      return submit_bh_wbc(rw, bh, bio_flags, NULL);
 +      return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL);
  }
  EXPORT_SYMBOL_GPL(_submit_bh);
  
 -int submit_bh(int rw, struct buffer_head *bh)
 +int submit_bh(int op, int op_flags,  struct buffer_head *bh)
  {
 -      return submit_bh_wbc(rw, bh, 0, NULL);
 +      return submit_bh_wbc(op, op_flags, bh, 0, NULL);
  }
  EXPORT_SYMBOL(submit_bh);
  
  /**
   * ll_rw_block: low-level access to block devices (DEPRECATED)
 - * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
 + * @op: whether to %READ or %WRITE
 + * @op_flags: rq_flag_bits
   * @nr: number of &struct buffer_heads in the array
   * @bhs: array of pointers to &struct buffer_head
   *
   * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
 - * requests an I/O operation on them, either a %READ or a %WRITE.  The third
 - * %READA option is described in the documentation for generic_make_request()
 - * which ll_rw_block() calls.
 + * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
 + * @op_flags contains flags modifying the detailed I/O behavior, most notably
 + * %REQ_RAHEAD.
   *
   * This function drops any buffer that it cannot get a lock on (with the
   * BH_Lock state bit), any buffer that appears to be clean when doing a write
   * All of the buffers must be for the same device, and must also be a
   * multiple of the current approved size for the device.
   */
 -void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 +void ll_rw_block(int op, int op_flags,  int nr, struct buffer_head *bhs[])
  {
        int i;
  
  
                if (!trylock_buffer(bh))
                        continue;
 -              if (rw == WRITE) {
 +              if (op == WRITE) {
                        if (test_clear_buffer_dirty(bh)) {
                                bh->b_end_io = end_buffer_write_sync;
                                get_bh(bh);
 -                              submit_bh(WRITE, bh);
 +                              submit_bh(op, op_flags, bh);
                                continue;
                        }
                } else {
                        if (!buffer_uptodate(bh)) {
                                bh->b_end_io = end_buffer_read_sync;
                                get_bh(bh);
 -                              submit_bh(rw, bh);
 +                              submit_bh(op, op_flags, bh);
                                continue;
                        }
                }
  }
  EXPORT_SYMBOL(ll_rw_block);
  
 -void write_dirty_buffer(struct buffer_head *bh, int rw)
 +void write_dirty_buffer(struct buffer_head *bh, int op_flags)
  {
        lock_buffer(bh);
        if (!test_clear_buffer_dirty(bh)) {
        }
        bh->b_end_io = end_buffer_write_sync;
        get_bh(bh);
 -      submit_bh(rw, bh);
 +      submit_bh(REQ_OP_WRITE, op_flags, bh);
  }
  EXPORT_SYMBOL(write_dirty_buffer);
  
   * and then start new I/O and then wait upon it.  The caller must have a ref on
   * the buffer_head.
   */
 -int __sync_dirty_buffer(struct buffer_head *bh, int rw)
 +int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
  {
        int ret = 0;
  
        if (test_clear_buffer_dirty(bh)) {
                get_bh(bh);
                bh->b_end_io = end_buffer_write_sync;
 -              ret = submit_bh(rw, bh);
 +              ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
                wait_on_buffer(bh);
                if (!ret && !buffer_uptodate(bh))
                        ret = -EIO;
@@@ -3395,7 -3457,7 +3461,7 @@@ int bh_submit_read(struct buffer_head *
  
        get_bh(bh);
        bh->b_end_io = end_buffer_read_sync;
 -      submit_bh(READ, bh);
 +      submit_bh(REQ_OP_READ, 0, bh);
        wait_on_buffer(bh);
        if (buffer_uptodate(bh))
                return 0;
diff --combined fs/internal.h
@@@ -11,6 -11,7 +11,7 @@@
  
  struct super_block;
  struct file_system_type;
+ struct iomap;
  struct linux_binprm;
  struct path;
  struct mount;
@@@ -39,6 -40,8 +40,8 @@@ static inline int __sync_blockdev(struc
   * buffer.c
   */
  extern void guard_bio_eod(int rw, struct bio *bio);
+ extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+               get_block_t *get_block, struct iomap *iomap);
  
  /*
   * char_dev.c
@@@ -130,7 -133,6 +133,7 @@@ extern int invalidate_inodes(struct sup
  extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
  extern int d_set_mounted(struct dentry *dentry);
  extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
 +extern struct dentry *d_alloc_cursor(struct dentry *);
  
  /*
   * read_write.c
diff --combined fs/nfsd/blocklayout.c
@@@ -2,6 -2,7 +2,7 @@@
   * Copyright (c) 2014-2016 Christoph Hellwig.
   */
  #include <linux/exportfs.h>
+ #include <linux/iomap.h>
  #include <linux/genhd.h>
  #include <linux/slab.h>
  #include <linux/pr.h>
@@@ -290,7 -291,7 +291,7 @@@ out_free_buf
        return error;
  }
  
 -#define NFSD_MDS_PR_KEY               0x0100000000000000
 +#define NFSD_MDS_PR_KEY               0x0100000000000000ULL
  
  /*
   * We use the client ID as a unique key for the reservations.
diff --combined fs/xfs/xfs_aops.c
@@@ -87,6 -87,12 +87,12 @@@ xfs_find_bdev_for_inode
   * We're now finished for good with this page.  Update the page state via the
   * associated buffer_heads, paying attention to the start and end offsets that
   * we need to process on the page.
+  *
+  * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+  * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+  * the page at all, as we may be racing with memory reclaim and it can free both
+  * the bufferhead chain and the page as it will see the page as clean and
+  * unused.
   */
  static void
  xfs_finish_page_writeback(
        int                     error)
  {
        unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
-       struct buffer_head      *head, *bh;
+       struct buffer_head      *head, *bh, *next;
        unsigned int            off = 0;
+       unsigned int            bsize;
  
        ASSERT(bvec->bv_offset < PAGE_SIZE);
        ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
  
        bh = head = page_buffers(bvec->bv_page);
  
+       bsize = bh->b_size;
        do {
+               next = bh->b_this_page;
                if (off < bvec->bv_offset)
                        goto next_bh;
                if (off > end)
                        break;
                bh->b_end_io(bh, !error);
  next_bh:
-               off += bh->b_size;
-       } while ((bh = bh->b_this_page) != head);
+               off += bsize;
+       } while ((bh = next) != head);
  }
  
  /*
@@@ -438,8 -447,7 +447,8 @@@ xfs_submit_ioend
  
        ioend->io_bio->bi_private = ioend;
        ioend->io_bio->bi_end_io = xfs_end_bio;
 -
 +      bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
 +                       (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
        /*
         * If we are failing the IO now, just mark the ioend with an
         * error and finish it. This will run IO completion immediately
                return status;
        }
  
 -      submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
 -                 ioend->io_bio);
 +      submit_bio(ioend->io_bio);
        return 0;
  }
  
@@@ -510,9 -519,8 +519,9 @@@ xfs_chain_bio
  
        bio_chain(ioend->io_bio, new);
        bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
 -      submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE,
 -                 ioend->io_bio);
 +      bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
 +                        (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
 +      submit_bio(ioend->io_bio);
        ioend->io_bio = new;
  }
  
@@@ -1041,6 -1049,20 +1050,20 @@@ xfs_vm_releasepage
  
        trace_xfs_releasepage(page->mapping->host, page, 0, 0);
  
+       /*
+        * mm accommodates an old ext3 case where clean pages might not have had
+        * the dirty bit cleared. Thus, it can send actual dirty pages to
+        * ->releasepage() via shrink_active_list(). Conversely,
+        * block_invalidatepage() can send pages that are still marked dirty
+        * but otherwise have invalidated buffers.
+        *
+        * We've historically freed buffers on the latter. Instead, quietly
+        * filter out all dirty pages to avoid spurious buffer state warnings.
+        * This can likely be removed once shrink_active_list() is fixed.
+        */
+       if (PageDirty(page))
+               return 0;
        xfs_count_page_state(page, &delalloc, &unwritten);
  
        if (WARN_ON_ONCE(delalloc))
@@@ -1144,6 -1166,8 +1167,8 @@@ __xfs_get_blocks
        ssize_t                 size;
        int                     new = 0;
  
+       BUG_ON(create && !direct);
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
  
        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
        size = bh_result->b_size;
  
-       if (!create && direct && offset >= i_size_read(inode))
+       if (!create && offset >= i_size_read(inode))
                return 0;
  
        /*
         * Direct I/O is usually done on preallocated files, so try getting
-        * a block mapping without an exclusive lock first.  For buffered
-        * writes we already have the exclusive iolock anyway, so avoiding
-        * a lock roundtrip here by taking the ilock exclusive from the
-        * beginning is a useful micro optimization.
+        * a block mapping without an exclusive lock first.
         */
-       if (create && !direct) {
-               lockmode = XFS_ILOCK_EXCL;
-               xfs_ilock(ip, lockmode);
-       } else {
-               lockmode = xfs_ilock_data_map_shared(ip);
-       }
+       lockmode = xfs_ilock_data_map_shared(ip);
  
        ASSERT(offset <= mp->m_super->s_maxbytes);
        if (offset + size > mp->m_super->s_maxbytes)
             (imap.br_startblock == HOLESTARTBLOCK ||
              imap.br_startblock == DELAYSTARTBLOCK) ||
             (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
-               if (direct || xfs_get_extsz_hint(ip)) {
-                       /*
-                        * xfs_iomap_write_direct() expects the shared lock. It
-                        * is unlocked on return.
-                        */
-                       if (lockmode == XFS_ILOCK_EXCL)
-                               xfs_ilock_demote(ip, lockmode);
-                       error = xfs_iomap_write_direct(ip, offset, size,
-                                                      &imap, nimaps);
-                       if (error)
-                               return error;
-                       new = 1;
+               /*
+                * xfs_iomap_write_direct() expects the shared lock. It
+                * is unlocked on return.
+                */
+               if (lockmode == XFS_ILOCK_EXCL)
+                       xfs_ilock_demote(ip, lockmode);
  
-               } else {
-                       /*
-                        * Delalloc reservations do not require a transaction,
-                        * we can go on without dropping the lock here. If we
-                        * are allocating a new delalloc block, make sure that
-                        * we set the new flag so that we mark the buffer new so
-                        * that we know that it is newly allocated if the write
-                        * fails.
-                        */
-                       if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
-                               new = 1;
-                       error = xfs_iomap_write_delay(ip, offset, size, &imap);
-                       if (error)
-                               goto out_unlock;
+               error = xfs_iomap_write_direct(ip, offset, size,
+                                              &imap, nimaps);
+               if (error)
+                       return error;
+               new = 1;
  
-                       xfs_iunlock(ip, lockmode);
-               }
                trace_xfs_get_blocks_alloc(ip, offset, size,
                                ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
                                                   : XFS_IO_DELALLOC, &imap);
        }
  
        /* trim mapping down to size requested */
-       if (direct || size > (1 << inode->i_blkbits))
-               xfs_map_trim_size(inode, iblock, bh_result,
-                                 &imap, offset, size);
+       xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
  
        /*
         * For unwritten extents do not report a disk address in the buffered
                if (ISUNWRITTEN(&imap))
                        set_buffer_unwritten(bh_result);
                /* direct IO needs special help */
-               if (create && direct) {
+               if (create) {
                        if (dax_fault)
                                ASSERT(!ISUNWRITTEN(&imap));
                        else
             (new || ISUNWRITTEN(&imap))))
                set_buffer_new(bh_result);
  
-       if (imap.br_startblock == DELAYSTARTBLOCK) {
-               BUG_ON(direct);
-               if (create) {
-                       set_buffer_uptodate(bh_result);
-                       set_buffer_mapped(bh_result);
-                       set_buffer_delay(bh_result);
-               }
-       }
+       BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
  
        return 0;
  
@@@ -1337,7 -1326,7 +1327,7 @@@ xfs_get_blocks_dax_fault
   * whereas if we have flags set we will always be called in task context
   * (i.e. from a workqueue).
   */
- STATIC int
+ int
  xfs_end_io_direct_write(
        struct kiocb            *iocb,
        loff_t                  offset,
@@@ -1408,234 -1397,10 +1398,10 @@@ xfs_vm_direct_IO
        struct kiocb            *iocb,
        struct iov_iter         *iter)
  {
-       struct inode            *inode = iocb->ki_filp->f_mapping->host;
-       dio_iodone_t            *endio = NULL;
-       int                     flags = 0;
-       struct block_device     *bdev;
-       if (iov_iter_rw(iter) == WRITE) {
-               endio = xfs_end_io_direct_write;
-               flags = DIO_ASYNC_EXTEND;
-       }
-       if (IS_DAX(inode)) {
-               return dax_do_io(iocb, inode, iter,
-                                xfs_get_blocks_direct, endio, 0);
-       }
-       bdev = xfs_find_bdev_for_inode(inode);
-       return  __blockdev_direct_IO(iocb, inode, bdev, iter,
-                       xfs_get_blocks_direct, endio, NULL, flags);
- }
- /*
-  * Punch out the delalloc blocks we have already allocated.
-  *
-  * Don't bother with xfs_setattr given that nothing can have made it to disk yet
-  * as the page is still locked at this point.
-  */
- STATIC void
- xfs_vm_kill_delalloc_range(
-       struct inode            *inode,
-       loff_t                  start,
-       loff_t                  end)
- {
-       struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fileoff_t           start_fsb;
-       xfs_fileoff_t           end_fsb;
-       int                     error;
-       start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
-       end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
-       if (end_fsb <= start_fsb)
-               return;
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-                                               end_fsb - start_fsb);
-       if (error) {
-               /* something screwed, just bail */
-               if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                       xfs_alert(ip->i_mount,
-               "xfs_vm_write_failed: unable to clean up ino %lld",
-                                       ip->i_ino);
-               }
-       }
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
- STATIC void
- xfs_vm_write_failed(
-       struct inode            *inode,
-       struct page             *page,
-       loff_t                  pos,
-       unsigned                len)
- {
-       loff_t                  block_offset;
-       loff_t                  block_start;
-       loff_t                  block_end;
-       loff_t                  from = pos & (PAGE_SIZE - 1);
-       loff_t                  to = from + len;
-       struct buffer_head      *bh, *head;
-       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
        /*
-        * The request pos offset might be 32 or 64 bit, this is all fine
-        * on 64-bit platform.  However, for 64-bit pos request on 32-bit
-        * platform, the high 32-bit will be masked off if we evaluate the
-        * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
-        * 0xfffff000 as an unsigned long, hence the result is incorrect
-        * which could cause the following ASSERT failed in most cases.
-        * In order to avoid this, we can evaluate the block_offset of the
-        * start of the page by using shifts rather than masks the mismatch
-        * problem.
+        * We just need the method present so that open/fcntl allow direct I/O.
         */
-       block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
-       ASSERT(block_offset + from == pos);
-       head = page_buffers(page);
-       block_start = 0;
-       for (bh = head; bh != head || !block_start;
-            bh = bh->b_this_page, block_start = block_end,
-                                  block_offset += bh->b_size) {
-               block_end = block_start + bh->b_size;
-               /* skip buffers before the write */
-               if (block_end <= from)
-                       continue;
-               /* if the buffer is after the write, we're done */
-               if (block_start >= to)
-                       break;
-               /*
-                * Process delalloc and unwritten buffers beyond EOF. We can
-                * encounter unwritten buffers in the event that a file has
-                * post-EOF unwritten extents and an extending write happens to
-                * fail (e.g., an unaligned write that also involves a delalloc
-                * to the same page).
-                */
-               if (!buffer_delay(bh) && !buffer_unwritten(bh))
-                       continue;
-               if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
-                   block_offset < i_size_read(inode))
-                       continue;
-               if (buffer_delay(bh))
-                       xfs_vm_kill_delalloc_range(inode, block_offset,
-                                                  block_offset + bh->b_size);
-               /*
-                * This buffer does not contain data anymore. make sure anyone
-                * who finds it knows that for certain.
-                */
-               clear_buffer_delay(bh);
-               clear_buffer_uptodate(bh);
-               clear_buffer_mapped(bh);
-               clear_buffer_new(bh);
-               clear_buffer_dirty(bh);
-               clear_buffer_unwritten(bh);
-       }
- }
- /*
-  * This used to call block_write_begin(), but it unlocks and releases the page
-  * on error, and we need that page to be able to punch stale delalloc blocks out
-  * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
-  * the appropriate point.
-  */
- STATIC int
- xfs_vm_write_begin(
-       struct file             *file,
-       struct address_space    *mapping,
-       loff_t                  pos,
-       unsigned                len,
-       unsigned                flags,
-       struct page             **pagep,
-       void                    **fsdata)
- {
-       pgoff_t                 index = pos >> PAGE_SHIFT;
-       struct page             *page;
-       int                     status;
-       struct xfs_mount        *mp = XFS_I(mapping->host)->i_mount;
-       ASSERT(len <= PAGE_SIZE);
-       page = grab_cache_page_write_begin(mapping, index, flags);
-       if (!page)
-               return -ENOMEM;
-       status = __block_write_begin(page, pos, len, xfs_get_blocks);
-       if (xfs_mp_fail_writes(mp))
-               status = -EIO;
-       if (unlikely(status)) {
-               struct inode    *inode = mapping->host;
-               size_t          isize = i_size_read(inode);
-               xfs_vm_write_failed(inode, page, pos, len);
-               unlock_page(page);
-               /*
-                * If the write is beyond EOF, we only want to kill blocks
-                * allocated in this write, not blocks that were previously
-                * written successfully.
-                */
-               if (xfs_mp_fail_writes(mp))
-                       isize = 0;
-               if (pos + len > isize) {
-                       ssize_t start = max_t(ssize_t, pos, isize);
-                       truncate_pagecache_range(inode, start, pos + len);
-               }
-               put_page(page);
-               page = NULL;
-       }
-       *pagep = page;
-       return status;
- }
- /*
-  * On failure, we only need to kill delalloc blocks beyond EOF in the range of
-  * this specific write because they will never be written. Previous writes
-  * beyond EOF where block allocation succeeded do not need to be trashed, so
-  * only new blocks from this write should be trashed. For blocks within
-  * EOF, generic_write_end() zeros them so they are safe to leave alone and be
-  * written with all the other valid data.
-  */
- STATIC int
- xfs_vm_write_end(
-       struct file             *file,
-       struct address_space    *mapping,
-       loff_t                  pos,
-       unsigned                len,
-       unsigned                copied,
-       struct page             *page,
-       void                    *fsdata)
- {
-       int                     ret;
-       ASSERT(len <= PAGE_SIZE);
-       ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-       if (unlikely(ret < len)) {
-               struct inode    *inode = mapping->host;
-               size_t          isize = i_size_read(inode);
-               loff_t          to = pos + len;
-               if (to > isize) {
-                       /* only kill blocks in this write beyond EOF */
-                       if (pos > isize)
-                               isize = pos;
-                       xfs_vm_kill_delalloc_range(inode, isize, to);
-                       truncate_pagecache_range(inode, isize, to);
-               }
-       }
-       return ret;
+       return -EINVAL;
  }
  
  STATIC sector_t
@@@ -1748,8 -1513,6 +1514,6 @@@ const struct address_space_operations x
        .set_page_dirty         = xfs_vm_set_page_dirty,
        .releasepage            = xfs_vm_releasepage,
        .invalidatepage         = xfs_vm_invalidatepage,
-       .write_begin            = xfs_vm_write_begin,
-       .write_end              = xfs_vm_write_end,
        .bmap                   = xfs_vm_bmap,
        .direct_IO              = xfs_vm_direct_IO,
        .migratepage            = buffer_migrate_page,
diff --combined fs/xfs/xfs_buf.c
@@@ -79,6 -79,47 +79,47 @@@ xfs_buf_vmap_len
        return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
  }
  
+ /*
+  * Bump the I/O in flight count on the buftarg if we haven't yet done so for
+  * this buffer. The count is incremented once per buffer (per hold cycle)
+  * because the corresponding decrement is deferred to buffer release. Buffers
+  * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
+  * tracking adds unnecessary overhead. This is used for synchronization purposes
+  * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
+  * in-flight buffers.
+  *
+  * Buffers that are never released (e.g., superblock, iclog buffers) must set
+  * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
+  * never reaches zero and unmount hangs indefinitely.
+  */
+ static inline void
+ xfs_buf_ioacct_inc(
+       struct xfs_buf  *bp)
+ {
+       if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
+               return;
+       ASSERT(bp->b_flags & XBF_ASYNC);
+       bp->b_flags |= _XBF_IN_FLIGHT;
+       percpu_counter_inc(&bp->b_target->bt_io_count);
+ }
+ /*
+  * Clear the in-flight state on a buffer about to be released to the LRU or
+  * freed and unaccount from the buftarg.
+  */
+ static inline void
+ xfs_buf_ioacct_dec(
+       struct xfs_buf  *bp)
+ {
+       if (!(bp->b_flags & _XBF_IN_FLIGHT))
+               return;
+       ASSERT(bp->b_flags & XBF_ASYNC);
+       bp->b_flags &= ~_XBF_IN_FLIGHT;
+       percpu_counter_dec(&bp->b_target->bt_io_count);
+ }
  /*
   * When we mark a buffer stale, we remove the buffer from the LRU and clear the
   * b_lru_ref count so that the buffer is freed immediately when the buffer
@@@ -102,6 -143,14 +143,14 @@@ xfs_buf_stale
         */
        bp->b_flags &= ~_XBF_DELWRI_Q;
  
+       /*
+        * Once the buffer is marked stale and unlocked, a subsequent lookup
+        * could reset b_flags. There is no guarantee that the buffer is
+        * unaccounted (released to LRU) before that occurs. Drop in-flight
+        * status now to preserve accounting consistency.
+        */
+       xfs_buf_ioacct_dec(bp);
        spin_lock(&bp->b_lock);
        atomic_set(&bp->b_lru_ref, 0);
        if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
@@@ -815,7 -864,8 +864,8 @@@ xfs_buf_get_uncached
        struct xfs_buf          *bp;
        DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
  
-       bp = _xfs_buf_alloc(target, &map, 1, 0);
+       /* flags might contain irrelevant bits, pass only what we care about */
+       bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
        if (unlikely(bp == NULL))
                goto fail;
  
@@@ -866,63 -916,85 +916,85 @@@ xfs_buf_hold
  }
  
  /*
-  *    Releases a hold on the specified buffer.  If the
-  *    the hold count is 1, calls xfs_buf_free.
+  * Release a hold on the specified buffer. If the hold count is 1, the buffer is
+  * placed on LRU or freed (depending on b_lru_ref).
   */
  void
  xfs_buf_rele(
        xfs_buf_t               *bp)
  {
        struct xfs_perag        *pag = bp->b_pag;
+       bool                    release;
+       bool                    freebuf = false;
  
        trace_xfs_buf_rele(bp, _RET_IP_);
  
        if (!pag) {
                ASSERT(list_empty(&bp->b_lru));
                ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
-               if (atomic_dec_and_test(&bp->b_hold))
+               if (atomic_dec_and_test(&bp->b_hold)) {
+                       xfs_buf_ioacct_dec(bp);
                        xfs_buf_free(bp);
+               }
                return;
        }
  
        ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
  
        ASSERT(atomic_read(&bp->b_hold) > 0);
-       if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-               spin_lock(&bp->b_lock);
-               if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
-                       /*
-                        * If the buffer is added to the LRU take a new
-                        * reference to the buffer for the LRU and clear the
-                        * (now stale) dispose list state flag
-                        */
-                       if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
-                               bp->b_state &= ~XFS_BSTATE_DISPOSE;
-                               atomic_inc(&bp->b_hold);
-                       }
-                       spin_unlock(&bp->b_lock);
-                       spin_unlock(&pag->pag_buf_lock);
-               } else {
-                       /*
-                        * most of the time buffers will already be removed from
-                        * the LRU, so optimise that case by checking for the
-                        * XFS_BSTATE_DISPOSE flag indicating the last list the
-                        * buffer was on was the disposal list
-                        */
-                       if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
-                               list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
-                       } else {
-                               ASSERT(list_empty(&bp->b_lru));
-                       }
-                       spin_unlock(&bp->b_lock);
  
-                       ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
-                       rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
-                       spin_unlock(&pag->pag_buf_lock);
-                       xfs_perag_put(pag);
-                       xfs_buf_free(bp);
+       release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+       spin_lock(&bp->b_lock);
+       if (!release) {
+               /*
+                * Drop the in-flight state if the buffer is already on the LRU
+                * and it holds the only reference. This is racy because we
+                * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
+                * ensures the decrement occurs only once per-buf.
+                */
+               if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+                       xfs_buf_ioacct_dec(bp);
+               goto out_unlock;
+       }
+       /* the last reference has been dropped ... */
+       xfs_buf_ioacct_dec(bp);
+       if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+               /*
+                * If the buffer is added to the LRU take a new reference to the
+                * buffer for the LRU and clear the (now stale) dispose list
+                * state flag
+                */
+               if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+                       bp->b_state &= ~XFS_BSTATE_DISPOSE;
+                       atomic_inc(&bp->b_hold);
+               }
+               spin_unlock(&pag->pag_buf_lock);
+       } else {
+               /*
+                * most of the time buffers will already be removed from the
+                * LRU, so optimise that case by checking for the
+                * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
+                * was on was the disposal list
+                */
+               if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+                       list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+               } else {
+                       ASSERT(list_empty(&bp->b_lru));
                }
+               ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+               rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+               spin_unlock(&pag->pag_buf_lock);
+               xfs_perag_put(pag);
+               freebuf = true;
        }
+ out_unlock:
+       spin_unlock(&bp->b_lock);
+       if (freebuf)
+               xfs_buf_free(bp);
  }
  
  
@@@ -944,10 -1016,12 +1016,12 @@@ xfs_buf_trylock
        int                     locked;
  
        locked = down_trylock(&bp->b_sema) == 0;
-       if (locked)
+       if (locked) {
                XB_SET_OWNER(bp);
-       trace_xfs_buf_trylock(bp, _RET_IP_);
+               trace_xfs_buf_trylock(bp, _RET_IP_);
+       } else {
+               trace_xfs_buf_trylock_fail(bp, _RET_IP_);
+       }
        return locked;
  }
  
@@@ -1127,8 -1201,7 +1201,8 @@@ xfs_buf_ioapply_map
        int             map,
        int             *buf_offset,
        int             *count,
 -      int             rw)
 +      int             op,
 +      int             op_flags)
  {
        int             page_index;
        int             total_nr_pages = bp->b_page_count;
  
  next_chunk:
        atomic_inc(&bp->b_io_remaining);
 -      nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
 -      if (nr_pages > total_nr_pages)
 -              nr_pages = total_nr_pages;
 +      nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
  
        bio = bio_alloc(GFP_NOIO, nr_pages);
        bio->bi_bdev = bp->b_target->bt_bdev;
        bio->bi_iter.bi_sector = sector;
        bio->bi_end_io = xfs_buf_bio_end_io;
        bio->bi_private = bp;
 -
 +      bio_set_op_attrs(bio, op, op_flags);
  
        for (; size && nr_pages; nr_pages--, page_index++) {
                int     rbytes, nbytes = PAGE_SIZE - offset;
                        flush_kernel_vmap_range(bp->b_addr,
                                                xfs_buf_vmap_len(bp));
                }
 -              submit_bio(rw, bio);
 +              submit_bio(bio);
                if (size)
                        goto next_chunk;
        } else {
@@@ -1209,8 -1284,7 +1283,8 @@@ _xfs_buf_ioapply
        struct xfs_buf  *bp)
  {
        struct blk_plug plug;
 -      int             rw;
 +      int             op;
 +      int             op_flags = 0;
        int             offset;
        int             size;
        int             i;
                bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;
  
        if (bp->b_flags & XBF_WRITE) {
 +              op = REQ_OP_WRITE;
                if (bp->b_flags & XBF_SYNCIO)
 -                      rw = WRITE_SYNC;
 -              else
 -                      rw = WRITE;
 +                      op_flags = WRITE_SYNC;
                if (bp->b_flags & XBF_FUA)
 -                      rw |= REQ_FUA;
 +                      op_flags |= REQ_FUA;
                if (bp->b_flags & XBF_FLUSH)
 -                      rw |= REQ_FLUSH;
 +                      op_flags |= REQ_PREFLUSH;
  
                /*
                 * Run the write verifier callback function if it exists. If
                        }
                }
        } else if (bp->b_flags & XBF_READ_AHEAD) {
 -              rw = READA;
 +              op = REQ_OP_READ;
 +              op_flags = REQ_RAHEAD;
        } else {
 -              rw = READ;
 +              op = REQ_OP_READ;
        }
  
        /* we only use the buffer cache for meta-data */
 -      rw |= REQ_META;
 +      op_flags |= REQ_META;
  
        /*
         * Walk all the vectors issuing IO on them. Set up the initial offset
        size = BBTOB(bp->b_io_length);
        blk_start_plug(&plug);
        for (i = 0; i < bp->b_map_count; i++) {
 -              xfs_buf_ioapply_map(bp, i, &offset, &size, rw);
 +              xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
                if (bp->b_error)
                        break;
                if (size <= 0)
@@@ -1339,6 -1413,7 +1413,7 @@@ xfs_buf_submit
         * xfs_buf_ioend too early.
         */
        atomic_set(&bp->b_io_remaining, 1);
+       xfs_buf_ioacct_inc(bp);
        _xfs_buf_ioapply(bp);
  
        /*
@@@ -1524,13 -1599,19 +1599,19 @@@ xfs_wait_buftarg
        int loop = 0;
  
        /*
-        * We need to flush the buffer workqueue to ensure that all IO
-        * completion processing is 100% done. Just waiting on buffer locks is
-        * not sufficient for async IO as the reference count held over IO is
-        * not released until after the buffer lock is dropped. Hence we need to
-        * ensure here that all reference counts have been dropped before we
-        * start walking the LRU list.
+        * First wait on the buftarg I/O count for all in-flight buffers to be
+        * released. This is critical as new buffers do not make the LRU until
+        * they are released.
+        *
+        * Next, flush the buffer workqueue to ensure all completion processing
+        * has finished. Just waiting on buffer locks is not sufficient for
+        * async IO as the reference count held over IO is not released until
+        * after the buffer lock is dropped. Hence we need to ensure here that
+        * all reference counts have been dropped before we start walking the
+        * LRU list.
         */
+       while (percpu_counter_sum(&btp->bt_io_count))
+               delay(100);
        drain_workqueue(btp->bt_mount->m_buf_workqueue);
  
        /* loop until there is nothing left on the lru list. */
@@@ -1627,6 -1708,8 +1708,8 @@@ xfs_free_buftarg
        struct xfs_buftarg      *btp)
  {
        unregister_shrinker(&btp->bt_shrinker);
+       ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+       percpu_counter_destroy(&btp->bt_io_count);
        list_lru_destroy(&btp->bt_lru);
  
        if (mp->m_flags & XFS_MOUNT_BARRIER)
@@@ -1691,6 -1774,9 +1774,9 @@@ xfs_alloc_buftarg
        if (list_lru_init(&btp->bt_lru))
                goto error;
  
+       if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+               goto error;
        btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
        btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
        btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@@ -1774,18 -1860,33 +1860,33 @@@ xfs_buf_cmp
        return 0;
  }
  
+ /*
+  * submit buffers for write.
+  *
+  * When we have a large buffer list, we do not want to hold all the buffers
+  * locked while we block on the request queue waiting for IO dispatch. To avoid
+  * this problem, we lock and submit buffers in groups of 50, thereby minimising
+  * the lock hold times for lists which may contain thousands of objects.
+  *
+  * To do this, we sort the buffer list before we walk the list to lock and
+  * submit buffers, and we plug and unplug around each group of buffers we
+  * submit.
+  */
  static int
- __xfs_buf_delwri_submit(
+ xfs_buf_delwri_submit_buffers(
        struct list_head        *buffer_list,
-       struct list_head        *io_list,
-       bool                    wait)
+       struct list_head        *wait_list)
  {
-       struct blk_plug         plug;
        struct xfs_buf          *bp, *n;
+       LIST_HEAD               (submit_list);
        int                     pinned = 0;
+       struct blk_plug         plug;
  
+       list_sort(NULL, buffer_list, xfs_buf_cmp);
+       blk_start_plug(&plug);
        list_for_each_entry_safe(bp, n, buffer_list, b_list) {
-               if (!wait) {
+               if (!wait_list) {
                        if (xfs_buf_ispinned(bp)) {
                                pinned++;
                                continue;
                        continue;
                }
  
-               list_move_tail(&bp->b_list, io_list);
                trace_xfs_buf_delwri_split(bp, _RET_IP_);
-       }
-       list_sort(NULL, io_list, xfs_buf_cmp);
-       blk_start_plug(&plug);
-       list_for_each_entry_safe(bp, n, io_list, b_list) {
-               bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
-               bp->b_flags |= XBF_WRITE | XBF_ASYNC;
  
                /*
-                * we do all Io submission async. This means if we need to wait
-                * for IO completion we need to take an extra reference so the
-                * buffer is still valid on the other side.
+                * We do all IO submission async. This means if we need
+                * to wait for IO completion we need to take an extra
+                * reference so the buffer is still valid on the other
+                * side. We need to move the buffer onto the io_list
+                * at this point so the caller can still access it.
                 */
-               if (wait)
+               bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
+               bp->b_flags |= XBF_WRITE | XBF_ASYNC;
+               if (wait_list) {
                        xfs_buf_hold(bp);
-               else
+                       list_move_tail(&bp->b_list, wait_list);
+               } else
                        list_del_init(&bp->b_list);
  
                xfs_buf_submit(bp);
@@@ -1849,8 -1946,7 +1946,7 @@@ in
  xfs_buf_delwri_submit_nowait(
        struct list_head        *buffer_list)
  {
-       LIST_HEAD               (io_list);
-       return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
+       return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
  }
  
  /*
  xfs_buf_delwri_submit(
        struct list_head        *buffer_list)
  {
-       LIST_HEAD               (io_list);
+       LIST_HEAD               (wait_list);
        int                     error = 0, error2;
        struct xfs_buf          *bp;
  
-       __xfs_buf_delwri_submit(buffer_list, &io_list, true);
+       xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
  
        /* Wait for IO to complete. */
-       while (!list_empty(&io_list)) {
-               bp = list_first_entry(&io_list, struct xfs_buf, b_list);
+       while (!list_empty(&wait_list)) {
+               bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
  
                list_del_init(&bp->b_list);
  
diff --combined fs/xfs/xfs_file.c
@@@ -37,6 -37,7 +37,7 @@@
  #include "xfs_log.h"
  #include "xfs_icache.h"
  #include "xfs_pnfs.h"
+ #include "xfs_iomap.h"
  
  #include <linux/dcache.h>
  #include <linux/falloc.h>
@@@ -80,61 -81,17 +81,17 @@@ xfs_rw_ilock_demote
  }
  
  /*
-  * xfs_iozero clears the specified range supplied via the page cache (except in
-  * the DAX case). Writes through the page cache will allocate blocks over holes,
-  * though the callers usually map the holes first and avoid them. If a block is
-  * not completely zeroed, then it will be read from disk before being partially
-  * zeroed.
-  *
-  * In the DAX case, we can just directly write to the underlying pages. This
-  * will not allocate blocks, but will avoid holes and unwritten extents and so
-  * not do unnecessary work.
+  * Clear the specified ranges to zero through either the pagecache or DAX.
+  * Holes and unwritten extents will be left as-is as they already are zeroed.
   */
  int
- xfs_iozero(
-       struct xfs_inode        *ip,    /* inode                        */
-       loff_t                  pos,    /* offset in file               */
-       size_t                  count)  /* size of data to zero         */
+ xfs_zero_range(
+       struct xfs_inode        *ip,
+       xfs_off_t               pos,
+       xfs_off_t               count,
+       bool                    *did_zero)
  {
-       struct page             *page;
-       struct address_space    *mapping;
-       int                     status = 0;
-       mapping = VFS_I(ip)->i_mapping;
-       do {
-               unsigned offset, bytes;
-               void *fsdata;
-               offset = (pos & (PAGE_SIZE -1)); /* Within page */
-               bytes = PAGE_SIZE - offset;
-               if (bytes > count)
-                       bytes = count;
-               if (IS_DAX(VFS_I(ip))) {
-                       status = dax_zero_page_range(VFS_I(ip), pos, bytes,
-                                                    xfs_get_blocks_direct);
-                       if (status)
-                               break;
-               } else {
-                       status = pagecache_write_begin(NULL, mapping, pos, bytes,
-                                               AOP_FLAG_UNINTERRUPTIBLE,
-                                               &page, &fsdata);
-                       if (status)
-                               break;
-                       zero_user(page, offset, bytes);
-                       status = pagecache_write_end(NULL, mapping, pos, bytes,
-                                               bytes, page, fsdata);
-                       WARN_ON(status <= 0); /* can't return less than zero! */
-                       status = 0;
-               }
-               pos += bytes;
-               count -= bytes;
-       } while (count);
-       return status;
+       return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
  }
  
  int
@@@ -282,48 -239,35 +239,35 @@@ xfs_file_fsync
  }
  
  STATIC ssize_t
- xfs_file_read_iter(
+ xfs_file_dio_aio_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
  {
-       struct file             *file = iocb->ki_filp;
-       struct inode            *inode = file->f_mapping->host;
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       size_t                  size = iov_iter_count(to);
+       loff_t                  isize = i_size_read(inode);
+       size_t                  count = iov_iter_count(to);
+       struct iov_iter         data;
+       struct xfs_buftarg      *target;
        ssize_t                 ret = 0;
-       int                     ioflags = 0;
-       xfs_fsize_t             n;
-       loff_t                  pos = iocb->ki_pos;
  
-       XFS_STATS_INC(mp, xs_read_calls);
-       if (unlikely(iocb->ki_flags & IOCB_DIRECT))
-               ioflags |= XFS_IO_ISDIRECT;
-       if (file->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
-       if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
-               xfs_buftarg_t   *target =
-                       XFS_IS_REALTIME_INODE(ip) ?
-                               mp->m_rtdev_targp : mp->m_ddev_targp;
-               /* DIO must be aligned to device logical sector size */
-               if ((pos | size) & target->bt_logical_sectormask) {
-                       if (pos == i_size_read(inode))
-                               return 0;
-                       return -EINVAL;
-               }
-       }
+       trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
  
-       n = mp->m_super->s_maxbytes - pos;
-       if (n <= 0 || size == 0)
-               return 0;
+       if (!count)
+               return 0; /* skip atime */
  
-       if (n < size)
-               size = n;
+       if (XFS_IS_REALTIME_INODE(ip))
+               target = ip->i_mount->m_rtdev_targp;
+       else
+               target = ip->i_mount->m_ddev_targp;
  
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -EIO;
+       /* DIO must be aligned to device logical sector size */
+       if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+               if (iocb->ki_pos == isize)
+                       return 0;
+               return -EINVAL;
+       }
  
        /*
         * Locking is a bit tricky here. If we take an exclusive lock for direct
         * serialisation.
         */
        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-       if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+       if (mapping->nrpages) {
                xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
                xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
  
                 * flush and reduce the chances of repeated iolock cycles going
                 * forward.
                 */
-               if (inode->i_mapping->nrpages) {
-                       ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+               if (mapping->nrpages) {
+                       ret = filemap_write_and_wait(mapping);
                        if (ret) {
                                xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
                                return ret;
                         * we fail to invalidate a page, but this should never
                         * happen on XFS. Warn if it does fail.
                         */
-                       ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+                       ret = invalidate_inode_pages2(mapping);
                        WARN_ON_ONCE(ret);
                        ret = 0;
                }
                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
        }
  
-       trace_xfs_file_read(ip, size, pos, ioflags);
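+       /*
+        * Run the direct I/O on a private copy of the iterator so the caller's
+        * iov_iter is only advanced by the bytes actually read.  Unlike the
+        * write path, no completion callback is needed because reads never
+        * convert unwritten extents or update the file size.
+        */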
+       data = *to;
+       ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+                       xfs_get_blocks_direct, NULL, NULL, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(to, ret);
+       }
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
  
+       file_accessed(iocb->ki_filp);
+       return ret;
+ }
+
+ static noinline ssize_t
+ xfs_file_dax_read(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+ {
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct iov_iter         data = *to;
+       size_t                  count = iov_iter_count(to);
+       ssize_t                 ret = 0;
+
+       trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+       if (!count)
+               return 0; /* skip atime */
+
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(to, ret);
+       }
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       file_accessed(iocb->ki_filp);
+       return ret;
+ }
+
+ STATIC ssize_t
+ xfs_file_buffered_aio_read(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+ {
+       struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
+       ssize_t                 ret;
+
+       trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
        ret = generic_file_read_iter(iocb, to);
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+       return ret;
+ }
+
+ STATIC ssize_t
+ xfs_file_read_iter(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+ {
+       struct inode            *inode = file_inode(iocb->ki_filp);
+       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
+       ssize_t                 ret = 0;
+
+       XFS_STATS_INC(mp, xs_read_calls);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
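+
+       /*
+        * DAX is checked first: DAX inodes always use the DAX read path, even
+        * if the file was opened with O_DIRECT.
+        */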
+       if (IS_DAX(inode))
+               ret = xfs_file_dax_read(iocb, to);
+       else if (iocb->ki_flags & IOCB_DIRECT)
+               ret = xfs_file_dio_aio_read(iocb, to);
+       else
+               ret = xfs_file_buffered_aio_read(iocb, to);
        if (ret > 0)
                XFS_STATS_ADD(mp, xs_read_bytes, ret);
-       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
  }
  
@@@ -389,18 -408,14 +408,14 @@@ xfs_file_splice_read
        unsigned int            flags)
  {
        struct xfs_inode        *ip = XFS_I(infilp->f_mapping->host);
-       int                     ioflags = 0;
        ssize_t                 ret;
  
        XFS_STATS_INC(ip->i_mount, xs_read_calls);
  
-       if (infilp->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
  
-       trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+       trace_xfs_file_splice_read(ip, count, *ppos);
  
        /*
         * DAX inodes cannot use the page cache for splice, so we have to push
@@@ -423,49 -438,6 +438,6 @@@ out
        return ret;
  }
  
- /*
-  * This routine is called to handle zeroing any space in the last block of the
-  * file that is beyond the EOF.  We do this since the size is being increased
-  * without writing anything to that block and we don't want to read the
-  * garbage on the disk.
-  */
- STATIC int                            /* error (positive) */
- xfs_zero_last_block(
-       struct xfs_inode        *ip,
-       xfs_fsize_t             offset,
-       xfs_fsize_t             isize,
-       bool                    *did_zeroing)
- {
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           last_fsb = XFS_B_TO_FSBT(mp, isize);
-       int                     zero_offset = XFS_B_FSB_OFFSET(mp, isize);
-       int                     zero_len;
-       int                     nimaps = 1;
-       int                     error = 0;
-       struct xfs_bmbt_irec    imap;
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       if (error)
-               return error;
-       ASSERT(nimaps > 0);
-       /*
-        * If the block underlying isize is just a hole, then there
-        * is nothing to zero.
-        */
-       if (imap.br_startblock == HOLESTARTBLOCK)
-               return 0;
-       zero_len = mp->m_sb.sb_blocksize - zero_offset;
-       if (isize + zero_len > offset)
-               zero_len = offset - isize;
-       *did_zeroing = true;
-       return xfs_iozero(ip, isize, zero_len);
- }
  /*
   * Zero any on disk space between the current EOF and the new, larger EOF.
   *
@@@ -484,94 -456,11 +456,11 @@@ xfs_zero_eof
        xfs_fsize_t             isize,          /* current inode size */
        bool                    *did_zeroing)
  {
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           start_zero_fsb;
-       xfs_fileoff_t           end_zero_fsb;
-       xfs_fileoff_t           zero_count_fsb;
-       xfs_fileoff_t           last_fsb;
-       xfs_fileoff_t           zero_off;
-       xfs_fsize_t             zero_len;
-       int                     nimaps;
-       int                     error = 0;
-       struct xfs_bmbt_irec    imap;
        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
        ASSERT(offset > isize);
  
        trace_xfs_zero_eof(ip, isize, offset - isize);
-       /*
-        * First handle zeroing the block on which isize resides.
-        *
-        * We only zero a part of that block so it is handled specially.
-        */
-       if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
-               error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
-               if (error)
-                       return error;
-       }
-       /*
-        * Calculate the range between the new size and the old where blocks
-        * needing to be zeroed may exist.
-        *
-        * To get the block where the last byte in the file currently resides,
-        * we need to subtract one from the size and truncate back to a block
-        * boundary.  We subtract 1 in case the size is exactly on a block
-        * boundary.
-        */
-       last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
-       start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
-       end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
-       ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
-       if (last_fsb == end_zero_fsb) {
-               /*
-                * The size was only incremented on its last block.
-                * We took care of that above, so just return.
-                */
-               return 0;
-       }
-       ASSERT(start_zero_fsb <= end_zero_fsb);
-       while (start_zero_fsb <= end_zero_fsb) {
-               nimaps = 1;
-               zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
-                                         &imap, &nimaps, 0);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               if (error)
-                       return error;
-               ASSERT(nimaps > 0);
-               if (imap.br_state == XFS_EXT_UNWRITTEN ||
-                   imap.br_startblock == HOLESTARTBLOCK) {
-                       start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-                       ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-                       continue;
-               }
-               /*
-                * There are blocks we need to zero.
-                */
-               zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
-               zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-               if ((zero_off + zero_len) > offset)
-                       zero_len = offset - zero_off;
-               error = xfs_iozero(ip, zero_off, zero_len);
-               if (error)
-                       return error;
-               *did_zeroing = true;
-               start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-               ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-       }
-       return 0;
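+       /*
+        * xfs_zero_range() zeroes both the partial block at the old EOF and
+        * any whole blocks up to the new EOF, skipping holes and unwritten
+        * extents.
+        */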
+       return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
  }
  
  /*
@@@ -722,8 -611,7 +611,7 @@@ xfs_file_dio_aio_write
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
  
        /* DIO must be aligned to device logical sector size */
-       if (!IS_DAX(inode) &&
-           ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+       if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
                return -EINVAL;
  
        /* "unaligned" here means not aligned to a filesystem block */
        end = iocb->ki_pos + count - 1;
  
        /*
-        * See xfs_file_read_iter() for why we do a full-file flush here.
+        * See xfs_file_dio_aio_read() for why we do a full-file flush here.
         */
        if (mapping->nrpages) {
                ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
                iolock = XFS_IOLOCK_SHARED;
        }
  
-       trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+       trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
  
        data = *from;
-       ret = mapping->a_ops->direct_IO(iocb, &data);
+       ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+                       xfs_get_blocks_direct, xfs_end_io_direct_write,
+                       NULL, DIO_ASYNC_EXTEND);
  
        /* see generic_file_direct_write() for why this is necessary */
        if (mapping->nrpages) {
@@@ -809,10 -699,70 +699,70 @@@ out
        xfs_rw_iunlock(ip, iolock);
  
        /*
-        * No fallback to buffered IO on errors for XFS. DAX can result in
-        * partial writes, but direct IO will either complete fully or fail.
+        * No fallback to buffered IO on errors for XFS; direct IO will either
+        * complete fully or fail.
         */
-       ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+       ASSERT(ret < 0 || ret == count);
+       return ret;
+ }
+
+ static noinline ssize_t
+ xfs_file_dax_write(
+       struct kiocb            *iocb,
+       struct iov_iter         *from)
+ {
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       ssize_t                 ret = 0;
+       int                     unaligned_io = 0;
+       int                     iolock;
+       struct iov_iter         data;
+
+       /* "unaligned" here means not aligned to a filesystem block */
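+       /*
+        * Unaligned writes and writes over cached pages take the iolock
+        * exclusive; aligned writes to a clean mapping can proceed under the
+        * shared lock.
+        */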
+       if ((iocb->ki_pos & mp->m_blockmask) ||
+           ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+               unaligned_io = 1;
+               iolock = XFS_IOLOCK_EXCL;
+       } else if (mapping->nrpages) {
+               iolock = XFS_IOLOCK_EXCL;
+       } else {
+               iolock = XFS_IOLOCK_SHARED;
+       }
+       xfs_rw_ilock(ip, iolock);
+       ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+       if (ret)
+               goto out;
+
+       /*
+        * Yes, even DAX files can have page cache attached to them:  A zeroed
+        * page is inserted into the pagecache when we have to serve a write
+        * fault on a hole.  It should never be dirtied and can simply be
+        * dropped from the pagecache once we get real data for the page.
+        */
+       if (mapping->nrpages) {
+               ret = invalidate_inode_pages2(mapping);
+               WARN_ON_ONCE(ret);
+       }
+
+       if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+               iolock = XFS_IOLOCK_SHARED;
+       }
+
+       trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+       data = *from;
+       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+                       xfs_end_io_direct_write, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(from, ret);
+       }
+ out:
+       xfs_rw_iunlock(ip, iolock);
        return ret;
  }
  
@@@ -839,9 -789,8 +789,8 @@@ xfs_file_buffered_aio_write
        current->backing_dev_info = inode_to_bdi(inode);
  
  write_retry:
-       trace_xfs_file_buffered_write(ip, iov_iter_count(from),
-                                     iocb->ki_pos, 0);
-       ret = generic_perform_write(file, from, iocb->ki_pos);
+       trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
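+       /* generic iomap-based buffered write, driven by the xfs_iomap_ops callbacks */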
+       ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
        if (likely(ret >= 0))
                iocb->ki_pos += ret;
  
@@@ -895,7 -844,9 +844,9 @@@ xfs_file_write_iter
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
  
-       if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+       if (IS_DAX(inode))
+               ret = xfs_file_dax_write(iocb, from);
+       else if (iocb->ki_flags & IOCB_DIRECT)
                ret = xfs_file_dio_aio_write(iocb, from);
        else
                ret = xfs_file_buffered_aio_write(iocb, from);
@@@ -1551,9 -1502,9 +1502,9 @@@ xfs_filemap_page_mkwrite
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
        if (IS_DAX(inode)) {
 -              ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
 +              ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
        } else {
-               ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+               ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
                ret = block_page_mkwrite_return(ret);
        }
  
@@@ -1585,7 -1536,7 +1536,7 @@@ xfs_filemap_fault
                 * changes to xfs_get_blocks_direct() to map unwritten extent
                 * ioend for conversion on read-only mappings.
                 */
 -              ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
 +              ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
        } else
                ret = filemap_fault(vma, vmf);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@@ -1622,7 -1573,7 +1573,7 @@@ xfs_filemap_pmd_fault
        }
  
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 -      ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
 +      ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
  
        if (flags & FAULT_FLAG_WRITE)