Merge tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
author	Linus Torvalds <torvalds@linux-foundation.org>
Wed, 27 Jul 2016 16:53:35 +0000 (09:53 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Wed, 27 Jul 2016 16:53:35 +0000 (09:53 -0700)
Pull xfs updates from Dave Chinner:
 "The major addition is the new iomap based block mapping
  infrastructure.  We've been kicking this about locally for years, but
  there are other filesystems want to use it too (e.g. gfs2).  Now it
  is fully working, reviewed and ready for merge and be used by other
  filesystems.

  There are a lot of other fixes and cleanups in the tree, but those are
  XFS internal things and none are of the scale or visibility of the
  iomap changes.  See below for details.

  I am likely to send another pull request next week - we're just about
  ready to merge some new functionality (on-disk block->owner reverse
  mapping infrastructure), but that's a huge chunk of code (74 files
  changed, 7283 insertions(+), 1114 deletions(-)) so I'm keeping it
  separate from all the "normal" pull request changes so they don't get
  lost in the noise.

  Summary of changes in this update:
   - generic iomap based IO path infrastructure
   - generic iomap based fiemap implementation
   - xfs iomap based IO path implementation
   - buffer error handling fixes
   - tracking of in-flight buffer IO for unmount serialisation
   - direct IO and DAX IO path separation and simplification
   - shortform directory format definition changes for wider platform
     compatibility
   - various buffer cache fixes
   - cleanups in preparation for rmap merge
   - error injection cleanups and fixes
   - log item format buffer memory allocation restructuring to prevent
     rare OOM reclaim deadlocks
   - sparse inode chunks are now fully supported"
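
To make the shape of the new interface concrete, here is a minimal sketch of
what a filesystem supplies.  The "myfs_" names and the mapping helper are
hypothetical; only struct iomap_ops, the callback signatures and the iomap_*
entry points come from this series.

/* Hypothetical glue for a filesystem adopting the iomap infrastructure. */
static int myfs_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
		unsigned flags, struct iomap *iomap)
{
	/*
	 * Take the filesystem's own locks, reserve or allocate space for
	 * [pos, pos + length), and describe the resulting extent by filling
	 * in iomap->type, ->offset, ->length, ->blkno and ->bdev.
	 */
	return myfs_map_blocks(inode, pos, length, flags, iomap);
}

static int myfs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
		ssize_t written, unsigned flags, struct iomap *iomap)
{
	/* Trim unused reservations and drop locks taken in ->iomap_begin. */
	return 0;
}

static struct iomap_ops myfs_iomap_ops = {
	.iomap_begin	= myfs_iomap_begin,
	.iomap_end	= myfs_iomap_end,
};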

* tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (53 commits)
  xfs: remove EXPERIMENTAL tag from sparse inode feature
  xfs: bufferhead chains are invalid after end_page_writeback
  xfs: allocate log vector buffers outside CIL context lock
  libxfs: directory node splitting does not have an extra block
  xfs: remove dax code from object file when disabled
  xfs: skip dirty pages in ->releasepage()
  xfs: remove __arch_pack
  xfs: kill xfs_dir2_inou_t
  xfs: kill xfs_dir2_sf_off_t
  xfs: split direct I/O and DAX path
  xfs: direct calls in the direct I/O path
  xfs: stop using generic_file_read_iter for direct I/O
  xfs: split xfs_file_read_iter into buffered and direct I/O helpers
  xfs: remove s_maxbytes enforcement in xfs_file_read_iter
  xfs: kill ioflags
  xfs: don't pass ioflags around in the ioctl path
  xfs: track and serialize in-flight async buffers against unmount
  xfs: exclude never-released buffers from buftarg I/O accounting
  xfs: don't reset b_retries to 0 on every failure
  xfs: remove extraneous buffer flag changes
  ...

66 files changed:
fs/Kconfig
fs/Makefile
fs/buffer.c
fs/internal.h
fs/iomap.c [new file with mode: 0644]
fs/nfsd/blocklayout.c
fs/nfsd/blocklayoutxdr.c
fs/xfs/Kconfig
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_alloc.h
fs/xfs/libxfs/xfs_attr_leaf.h
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_bmap.h
fs/xfs/libxfs/xfs_bmap_btree.c
fs/xfs/libxfs/xfs_btree.c
fs/xfs/libxfs/xfs_btree.h
fs/xfs/libxfs/xfs_da_btree.c
fs/xfs/libxfs/xfs_da_format.c
fs/xfs/libxfs/xfs_da_format.h
fs/xfs/libxfs/xfs_dir2_sf.c
fs/xfs/libxfs/xfs_format.h
fs/xfs/libxfs/xfs_fs.h
fs/xfs/libxfs/xfs_ialloc.c
fs/xfs/libxfs/xfs_rtbitmap.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_attr_inactive.c
fs/xfs/xfs_attr_list.c
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_bmap_util.h
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_dquot.c
fs/xfs/xfs_dquot_item.c
fs/xfs/xfs_error.c
fs/xfs/xfs_error.h
fs/xfs/xfs_extfree_item.c
fs/xfs/xfs_file.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_icache.c
fs/xfs/xfs_icache.h
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_ioctl.h
fs/xfs/xfs_ioctl32.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iomap.h
fs/xfs/xfs_iops.c
fs/xfs/xfs_linux.h
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_cil.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_ondisk.h
fs/xfs/xfs_pnfs.c
fs/xfs/xfs_rtalloc.h
fs/xfs/xfs_super.c
fs/xfs/xfs_super.h
fs/xfs/xfs_sysfs.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans.h
include/linux/exportfs.h
include/linux/iomap.h [new file with mode: 0644]

index b8fcb41..4524916 100644 (file)
@@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS
 
 if BLOCK
 
+config FS_IOMAP
+       bool
+
 source "fs/ext2/Kconfig"
 source "fs/ext4/Kconfig"
 source "fs/jbd2/Kconfig"
index 85b6e13..ed2b632 100644 (file)
@@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP)                += coredump.o
 obj-$(CONFIG_SYSCTL)           += drop_caches.o
 
 obj-$(CONFIG_FHANDLE)          += fhandle.o
+obj-$(CONFIG_FS_IOMAP)         += iomap.o
 
 obj-y                          += quota/
 
index b9fa1be..9c8eb9b 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
+#include <linux/iomap.h>
 #include <linux/mm.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
@@ -1892,8 +1893,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
 
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-               get_block_t *get_block)
+static void
+iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
+               struct iomap *iomap)
+{
+       loff_t offset = block << inode->i_blkbits;
+
+       bh->b_bdev = iomap->bdev;
+
+       /*
+        * Block points to offset in file we need to map, iomap contains
+        * the offset at which the map starts. If the map ends before the
+        * current block, then do not map the buffer and let the caller
+        * handle it.
+        */
+       BUG_ON(offset >= iomap->offset + iomap->length);
+
+       switch (iomap->type) {
+       case IOMAP_HOLE:
+               /*
+                * If the buffer is not up to date or beyond the current EOF,
+                * we need to mark it as new to ensure sub-block zeroing is
+                * executed if necessary.
+                */
+               if (!buffer_uptodate(bh) ||
+                   (offset >= i_size_read(inode)))
+                       set_buffer_new(bh);
+               break;
+       case IOMAP_DELALLOC:
+               if (!buffer_uptodate(bh) ||
+                   (offset >= i_size_read(inode)))
+                       set_buffer_new(bh);
+               set_buffer_uptodate(bh);
+               set_buffer_mapped(bh);
+               set_buffer_delay(bh);
+               break;
+       case IOMAP_UNWRITTEN:
+               /*
+                * For unwritten regions, we always need to ensure that
+                * sub-block writes zero the regions of the block we are not
+                * writing to. Set the buffer as new to ensure this.
+                */
+               set_buffer_new(bh);
+               set_buffer_unwritten(bh);
+               /* FALLTHRU */
+       case IOMAP_MAPPED:
+               if (offset >= i_size_read(inode))
+                       set_buffer_new(bh);
+               bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
+                               ((offset - iomap->offset) >> inode->i_blkbits);
+               set_buffer_mapped(bh);
+               break;
+       }
+}
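
The block number conversion above mixes units: iomap->blkno is expressed in
512-byte sectors while buffer heads are addressed in filesystem blocks.  A
worked example with assumed values, not taken from the patch:

	/*
	 * Assume 4k filesystem blocks (i_blkbits = 12), iomap->blkno = 8192
	 * sectors, iomap->offset = 0x4000 and this buffer's file offset 0x5000:
	 *
	 *   8192 >> (12 - 9)          = 1024   fs block where the mapping starts
	 *   (0x5000 - 0x4000) >> 12   =    1   fs blocks into the mapping
	 *   bh->b_blocknr             = 1025
	 */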
+
+int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+               get_block_t *get_block, struct iomap *iomap)
 {
        unsigned from = pos & (PAGE_SIZE - 1);
        unsigned to = from + len;
@@ -1929,9 +1984,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
-                       err = get_block(inode, block, bh, 1);
-                       if (err)
-                               break;
+                       if (get_block) {
+                               err = get_block(inode, block, bh, 1);
+                               if (err)
+                                       break;
+                       } else {
+                               iomap_to_bh(inode, block, bh, iomap);
+                       }
+
                        if (buffer_new(bh)) {
                                unmap_underlying_metadata(bh->b_bdev,
                                                        bh->b_blocknr);
@@ -1972,6 +2032,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                page_zero_new_buffers(page, from, to);
        return err;
 }
+
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+               get_block_t *get_block)
+{
+       return __block_write_begin_int(page, pos, len, get_block, NULL);
+}
 EXPORT_SYMBOL(__block_write_begin);
 
 static int __block_commit_write(struct inode *inode, struct page *page,
index f57ced5..cef0913 100644 (file)
@@ -11,6 +11,7 @@
 
 struct super_block;
 struct file_system_type;
+struct iomap;
 struct linux_binprm;
 struct path;
 struct mount;
@@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
  * buffer.c
  */
 extern void guard_bio_eod(int rw, struct bio *bio);
+extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+               get_block_t *get_block, struct iomap *iomap);
 
 /*
  * char_dev.c
diff --git a/fs/iomap.c b/fs/iomap.c
new file mode 100644 (file)
index 0000000..48141b8
--- /dev/null
@@ -0,0 +1,497 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include <linux/dax.h>
+#include "internal.h"
+
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+               void *data, struct iomap *iomap);
+
+/*
+ * Execute an iomap write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ *
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and most importantly avoid the need for filesystem specific
+ * locking per page. Instead, all the operations are amortised over the entire
+ * range of pages. It is assumed that the filesystems will lock whatever
+ * resources they require in the iomap_begin call, and release them in the
+ * iomap_end call.
+ */
+static loff_t
+iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
+               struct iomap_ops *ops, void *data, iomap_actor_t actor)
+{
+       struct iomap iomap = { 0 };
+       loff_t written = 0, ret;
+
+       /*
+        * Need to map a range from start position for length bytes. This can
+        * span multiple pages - it is only guaranteed to return a range of a
+        * single type of pages (e.g. all into a hole, all mapped or all
+        * unwritten). Failure at this point has nothing to undo.
+        *
+        * If allocation is required for this range, reserve the space now so
+        * that the allocation is guaranteed to succeed later on. Once we copy
+        * the data into the page cache pages, then we cannot fail otherwise we
+        * expose transient stale data. If the reserve fails, we can safely
+        * back out at this point as there is nothing to undo.
+        */
+       ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
+       if (ret)
+               return ret;
+       if (WARN_ON(iomap.offset > pos))
+               return -EIO;
+
+       /*
+        * Cut down the length to the one actually provided by the filesystem,
+        * as it might not be able to give us the whole size that we requested.
+        */
+       if (iomap.offset + iomap.length < pos + length)
+               length = iomap.offset + iomap.length - pos;
+
+       /*
+        * Now that we have guaranteed that the space allocation will succeed,
+        * we can do the copy-in page by page without having to worry about
+        * failures exposing transient data.
+        */
+       written = actor(inode, pos, length, data, &iomap);
+
+       /*
+        * Now the data has been copied, commit the range we've copied.  This
+        * should not fail unless the filesystem has had a fatal error.
+        */
+       ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
+                       flags, &iomap);
+
+       return written ? written : ret;
+}
+
+static void
+iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
+{
+       loff_t i_size = i_size_read(inode);
+
+       /*
+        * Only truncate newly allocated pages beyond EOF, even if the
+        * write started inside the existing inode size.
+        */
+       if (pos + len > i_size)
+               truncate_pagecache_range(inode, max(pos, i_size), pos + len);
+}
+
+static int
+iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
+               struct page **pagep, struct iomap *iomap)
+{
+       pgoff_t index = pos >> PAGE_SHIFT;
+       struct page *page;
+       int status = 0;
+
+       BUG_ON(pos + len > iomap->offset + iomap->length);
+
+       page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
+       if (!page)
+               return -ENOMEM;
+
+       status = __block_write_begin_int(page, pos, len, NULL, iomap);
+       if (unlikely(status)) {
+               unlock_page(page);
+               put_page(page);
+               page = NULL;
+
+               iomap_write_failed(inode, pos, len);
+       }
+
+       *pagep = page;
+       return status;
+}
+
+static int
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+               unsigned copied, struct page *page)
+{
+       int ret;
+
+       ret = generic_write_end(NULL, inode->i_mapping, pos, len,
+                       copied, page, NULL);
+       if (ret < len)
+               iomap_write_failed(inode, pos, len);
+       return ret;
+}
+
+static loff_t
+iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+               struct iomap *iomap)
+{
+       struct iov_iter *i = data;
+       long status = 0;
+       ssize_t written = 0;
+       unsigned int flags = AOP_FLAG_NOFS;
+
+       /*
+        * Copies from kernel address space cannot fail (NFSD is a big user).
+        */
+       if (!iter_is_iovec(i))
+               flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+       do {
+               struct page *page;
+               unsigned long offset;   /* Offset into pagecache page */
+               unsigned long bytes;    /* Bytes to write to page */
+               size_t copied;          /* Bytes copied from user */
+
+               offset = (pos & (PAGE_SIZE - 1));
+               bytes = min_t(unsigned long, PAGE_SIZE - offset,
+                                               iov_iter_count(i));
+again:
+               if (bytes > length)
+                       bytes = length;
+
+               /*
+                * Bring in the user page that we will copy from _first_.
+                * Otherwise there's a nasty deadlock on copying from the
+                * same page as we're writing to, without it being marked
+                * up-to-date.
+                *
+                * Not only is this an optimisation, but it is also required
+                * to check that the address is actually valid, when atomic
+                * usercopies are used, below.
+                */
+               if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+                       status = -EFAULT;
+                       break;
+               }
+
+               status = iomap_write_begin(inode, pos, bytes, flags, &page,
+                               iomap);
+               if (unlikely(status))
+                       break;
+
+               if (mapping_writably_mapped(inode->i_mapping))
+                       flush_dcache_page(page);
+
+               pagefault_disable();
+               copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+               pagefault_enable();
+
+               flush_dcache_page(page);
+               mark_page_accessed(page);
+
+               status = iomap_write_end(inode, pos, bytes, copied, page);
+               if (unlikely(status < 0))
+                       break;
+               copied = status;
+
+               cond_resched();
+
+               iov_iter_advance(i, copied);
+               if (unlikely(copied == 0)) {
+                       /*
+                        * If we were unable to copy any data at all, we must
+                        * fall back to a single segment length write.
+                        *
+                        * If we didn't fall back here, we could livelock
+                        * because not all segments in the iov can be copied at
+                        * once without a pagefault.
+                        */
+                       bytes = min_t(unsigned long, PAGE_SIZE - offset,
+                                               iov_iter_single_seg_count(i));
+                       goto again;
+               }
+               pos += copied;
+               written += copied;
+               length -= copied;
+
+               balance_dirty_pages_ratelimited(inode->i_mapping);
+       } while (iov_iter_count(i) && length);
+
+       return written ? written : status;
+}
+
+ssize_t
+iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
+               struct iomap_ops *ops)
+{
+       struct inode *inode = iocb->ki_filp->f_mapping->host;
+       loff_t pos = iocb->ki_pos, ret = 0, written = 0;
+
+       while (iov_iter_count(iter)) {
+               ret = iomap_apply(inode, pos, iov_iter_count(iter),
+                               IOMAP_WRITE, ops, iter, iomap_write_actor);
+               if (ret <= 0)
+                       break;
+               pos += ret;
+               written += ret;
+       }
+
+       return written ? written : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
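
A buffered write caller then looks roughly like the following, reusing the
hypothetical myfs_iomap_ops sketched after the pull request summary above.
Note that the helper does not advance iocb->ki_pos, so the caller does.

/* Sketch only: myfs_* is hypothetical and error handling is trimmed. */
static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = iomap_file_buffered_write(iocb, from, &myfs_iomap_ops);
	inode_unlock(inode);

	if (ret > 0) {
		iocb->ki_pos += ret;		/* helper leaves ki_pos alone */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}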
+
+static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
+               unsigned bytes, struct iomap *iomap)
+{
+       struct page *page;
+       int status;
+
+       status = iomap_write_begin(inode, pos, bytes,
+                       AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
+       if (status)
+               return status;
+
+       zero_user(page, offset, bytes);
+       mark_page_accessed(page);
+
+       return iomap_write_end(inode, pos, bytes, bytes, page);
+}
+
+static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
+               struct iomap *iomap)
+{
+       sector_t sector = iomap->blkno +
+               (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
+
+       return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
+}
+
+static loff_t
+iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
+               void *data, struct iomap *iomap)
+{
+       bool *did_zero = data;
+       loff_t written = 0;
+       int status;
+
+       /* already zeroed?  we're done. */
+       if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+               return count;
+
+       do {
+               unsigned offset, bytes;
+
+               offset = pos & (PAGE_SIZE - 1); /* Within page */
+               bytes = min_t(unsigned, PAGE_SIZE - offset, count);
+
+               if (IS_DAX(inode))
+                       status = iomap_dax_zero(pos, offset, bytes, iomap);
+               else
+                       status = iomap_zero(inode, pos, offset, bytes, iomap);
+               if (status < 0)
+                       return status;
+
+               pos += bytes;
+               count -= bytes;
+               written += bytes;
+               if (did_zero)
+                       *did_zero = true;
+       } while (count > 0);
+
+       return written;
+}
+
+int
+iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+               struct iomap_ops *ops)
+{
+       loff_t ret;
+
+       while (len > 0) {
+               ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
+                               ops, did_zero, iomap_zero_range_actor);
+               if (ret <= 0)
+                       return ret;
+
+               pos += ret;
+               len -= ret;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_zero_range);
+
+int
+iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+               struct iomap_ops *ops)
+{
+       unsigned blocksize = (1 << inode->i_blkbits);
+       unsigned off = pos & (blocksize - 1);
+
+       /* Block boundary? Nothing to do */
+       if (!off)
+               return 0;
+       return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+}
+EXPORT_SYMBOL_GPL(iomap_truncate_page);
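
A hypothetical truncate path would use this to zero the bytes beyond the new
EOF inside the last block before i_size is reduced, so that a later extension
of the file does not expose stale data:

/* Sketch only: myfs_* and the ops structure are hypothetical. */
static int myfs_truncate_tail(struct inode *inode, loff_t newsize)
{
	bool did_zero = false;

	/* No-op if newsize is already block aligned. */
	return iomap_truncate_page(inode, newsize, &did_zero, &myfs_iomap_ops);
}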
+
+static loff_t
+iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
+               void *data, struct iomap *iomap)
+{
+       struct page *page = data;
+       int ret;
+
+       ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
+                       NULL, iomap);
+       if (ret)
+               return ret;
+
+       block_commit_write(page, 0, length);
+       return length;
+}
+
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+               struct iomap_ops *ops)
+{
+       struct page *page = vmf->page;
+       struct inode *inode = file_inode(vma->vm_file);
+       unsigned long length;
+       loff_t offset, size;
+       ssize_t ret;
+
+       lock_page(page);
+       size = i_size_read(inode);
+       if ((page->mapping != inode->i_mapping) ||
+           (page_offset(page) > size)) {
+               /* We overload EFAULT to mean page got truncated */
+               ret = -EFAULT;
+               goto out_unlock;
+       }
+
+       /* page is wholly or partially inside EOF */
+       if (((page->index + 1) << PAGE_SHIFT) > size)
+               length = size & ~PAGE_MASK;
+       else
+               length = PAGE_SIZE;
+
+       offset = page_offset(page);
+       while (length > 0) {
+               ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
+                               ops, page, iomap_page_mkwrite_actor);
+               if (unlikely(ret <= 0))
+                       goto out_unlock;
+               offset += ret;
+               length -= ret;
+       }
+
+       set_page_dirty(page);
+       wait_for_stable_page(page);
+       return 0;
+out_unlock:
+       unlock_page(page);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
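
Wiring this into a fault handler is then a thin wrapper plus the usual
pagefault bookkeeping; a rough sketch with the hypothetical ops again (a real
filesystem would also take its mmap/extent lock around the call):

/* Sketch only: myfs_* is hypothetical; fs locking deliberately omitted. */
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	int ret;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vma->vm_file);
	ret = iomap_page_mkwrite(vma, vmf, &myfs_iomap_ops);
	sb_end_pagefault(inode->i_sb);

	return block_page_mkwrite_return(ret);
}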
+
+struct fiemap_ctx {
+       struct fiemap_extent_info *fi;
+       struct iomap prev;
+};
+
+static int iomap_to_fiemap(struct fiemap_extent_info *fi,
+               struct iomap *iomap, u32 flags)
+{
+       switch (iomap->type) {
+       case IOMAP_HOLE:
+               /* skip holes */
+               return 0;
+       case IOMAP_DELALLOC:
+               flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
+               break;
+       case IOMAP_UNWRITTEN:
+               flags |= FIEMAP_EXTENT_UNWRITTEN;
+               break;
+       case IOMAP_MAPPED:
+               break;
+       }
+
+       return fiemap_fill_next_extent(fi, iomap->offset,
+                       iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
+                       iomap->length, flags | FIEMAP_EXTENT_MERGED);
+
+}
+
+static loff_t
+iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+               struct iomap *iomap)
+{
+       struct fiemap_ctx *ctx = data;
+       loff_t ret = length;
+
+       if (iomap->type == IOMAP_HOLE)
+               return length;
+
+       ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
+       ctx->prev = *iomap;
+       switch (ret) {
+       case 0:         /* success */
+               return length;
+       case 1:         /* extent array full */
+               return 0;
+       default:
+               return ret;
+       }
+}
+
+int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
+               loff_t start, loff_t len, struct iomap_ops *ops)
+{
+       struct fiemap_ctx ctx;
+       loff_t ret;
+
+       memset(&ctx, 0, sizeof(ctx));
+       ctx.fi = fi;
+       ctx.prev.type = IOMAP_HOLE;
+
+       ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
+       if (ret)
+               return ret;
+
+       ret = filemap_write_and_wait(inode->i_mapping);
+       if (ret)
+               return ret;
+
+       while (len > 0) {
+               ret = iomap_apply(inode, start, len, 0, ops, &ctx,
+                               iomap_fiemap_actor);
+               if (ret < 0)
+                       return ret;
+               if (ret == 0)
+                       break;
+
+               start += ret;
+               len -= ret;
+       }
+
+       if (ctx.prev.type != IOMAP_HOLE) {
+               ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_fiemap);
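
The corresponding ->fiemap method also reduces to a thin wrapper; a sketch,
again with the hypothetical ops:

/* Sketch only: myfs_* is hypothetical; any serialisation is up to the fs. */
static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		u64 start, u64 length)
{
	return iomap_fiemap(inode, fieinfo, start, length, &myfs_iomap_ops);
}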
index 31f3df1..ad2c05e 100644 (file)
@@ -2,6 +2,7 @@
  * Copyright (c) 2014-2016 Christoph Hellwig.
  */
 #include <linux/exportfs.h>
+#include <linux/iomap.h>
 #include <linux/genhd.h>
 #include <linux/slab.h>
 #include <linux/pr.h>
index 6c3b316..4ebaaf4 100644 (file)
@@ -3,6 +3,7 @@
  */
 #include <linux/sunrpc/svc.h>
 #include <linux/exportfs.h>
+#include <linux/iomap.h>
 #include <linux/nfs4.h>
 
 #include "nfsd.h"
index 5d47b4d..35faf12 100644 (file)
@@ -4,6 +4,7 @@ config XFS_FS
        depends on (64BIT || LBDAF)
        select EXPORTFS
        select LIBCRC32C
+       select FS_IOMAP
        help
          XFS is a high performance journaling filesystem which originated
          on the SGI IRIX platform.  It is completely multi-threaded, can
index a708e38..88c26b8 100644 (file)
@@ -84,7 +84,7 @@ xfs_alloc_lookup_ge(
  * Lookup the first record less than or equal to [bno, len]
  * in the btree given by cur.
  */
-int                                    /* error */
+static int                             /* error */
 xfs_alloc_lookup_le(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_agblock_t           bno,    /* starting block of extent */
@@ -1839,19 +1839,8 @@ void
 xfs_alloc_compute_maxlevels(
        xfs_mount_t     *mp)    /* file system mount structure */
 {
-       int             level;
-       uint            maxblocks;
-       uint            maxleafents;
-       int             minleafrecs;
-       int             minnoderecs;
-
-       maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
-       minleafrecs = mp->m_alloc_mnr[0];
-       minnoderecs = mp->m_alloc_mnr[1];
-       maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
-       for (level = 1; maxblocks > 1; level++)
-               maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
-       mp->m_ag_maxlevels = level;
+       mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr,
+                       (mp->m_sb.sb_agblocks + 1) / 2);
 }
 
 /*
@@ -2658,55 +2647,79 @@ error0:
        return error;
 }
 
-/*
- * Free an extent.
- * Just break up the extent address and hand off to xfs_free_ag_extent
- * after fixing up the freelist.
- */
-int                            /* error */
-xfs_free_extent(
-       xfs_trans_t     *tp,    /* transaction pointer */
-       xfs_fsblock_t   bno,    /* starting block number of extent */
-       xfs_extlen_t    len)    /* length of extent */
+/* Ensure that the freelist is at full capacity. */
+int
+xfs_free_extent_fix_freelist(
+       struct xfs_trans        *tp,
+       xfs_agnumber_t          agno,
+       struct xfs_buf          **agbp)
 {
-       xfs_alloc_arg_t args;
-       int             error;
+       struct xfs_alloc_arg    args;
+       int                     error;
 
-       ASSERT(len != 0);
-       memset(&args, 0, sizeof(xfs_alloc_arg_t));
+       memset(&args, 0, sizeof(struct xfs_alloc_arg));
        args.tp = tp;
        args.mp = tp->t_mountp;
+       args.agno = agno;
 
        /*
         * validate that the block number is legal - this enables us to detect
         * and handle a silent filesystem corruption rather than crashing.
         */
-       args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
        if (args.agno >= args.mp->m_sb.sb_agcount)
                return -EFSCORRUPTED;
 
-       args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
-       if (args.agbno >= args.mp->m_sb.sb_agblocks)
-               return -EFSCORRUPTED;
-
        args.pag = xfs_perag_get(args.mp, args.agno);
        ASSERT(args.pag);
 
        error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
        if (error)
-               goto error0;
+               goto out;
+
+       *agbp = args.agbp;
+out:
+       xfs_perag_put(args.pag);
+       return error;
+}
+
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ */
+int                            /* error */
+xfs_free_extent(
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_fsblock_t           bno,    /* starting block number of extent */
+       xfs_extlen_t            len)    /* length of extent */
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+       struct xfs_buf          *agbp;
+       xfs_agnumber_t          agno = XFS_FSB_TO_AGNO(mp, bno);
+       xfs_agblock_t           agbno = XFS_FSB_TO_AGBNO(mp, bno);
+       int                     error;
+
+       ASSERT(len != 0);
+
+       error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
+       if (error)
+               return error;
+
+       XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err);
 
        /* validate the extent size is legal now we have the agf locked */
-       if (args.agbno + len >
-                       be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
-               error = -EFSCORRUPTED;
-               goto error0;
-       }
+       XFS_WANT_CORRUPTED_GOTO(mp,
+               agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
+                               err);
 
-       error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
-       if (!error)
-               xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
-error0:
-       xfs_perag_put(args.pag);
+       error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0);
+       if (error)
+               goto err;
+
+       xfs_extent_busy_insert(tp, agno, agbno, len, 0);
+       return 0;
+
+err:
+       xfs_trans_brelse(tp, agbp);
        return error;
 }
index 135eb3d..cf268b2 100644 (file)
@@ -212,13 +212,6 @@ xfs_free_extent(
        xfs_fsblock_t   bno,    /* starting block number of extent */
        xfs_extlen_t    len);   /* length of extent */
 
-int                                    /* error */
-xfs_alloc_lookup_le(
-       struct xfs_btree_cur    *cur,   /* btree cursor */
-       xfs_agblock_t           bno,    /* starting block of extent */
-       xfs_extlen_t            len,    /* length of extent */
-       int                     *stat); /* success/failure */
-
 int                            /* error */
 xfs_alloc_lookup_ge(
        struct xfs_btree_cur    *cur,   /* btree cursor */
@@ -236,5 +229,7 @@ xfs_alloc_get_rec(
 int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
                        xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
 int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
+int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
+               struct xfs_buf **agbp);
 
 #endif /* __XFS_ALLOC_H__ */
index 882c8d3..4f2aed0 100644 (file)
@@ -50,7 +50,6 @@ int   xfs_attr_shortform_lookup(struct xfs_da_args *args);
 int    xfs_attr_shortform_getvalue(struct xfs_da_args *args);
 int    xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
 int    xfs_attr_shortform_remove(struct xfs_da_args *args);
-int    xfs_attr_shortform_list(struct xfs_attr_list_context *context);
 int    xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
 int    xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
 void   xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
@@ -88,8 +87,6 @@ int   xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
 void   xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
                                       struct xfs_da_state_blk *drop_blk,
                                       struct xfs_da_state_blk *save_blk);
-int    xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
-
 /*
  * Utility routines.
  */
index 932381c..2f2c85c 100644 (file)
@@ -570,14 +570,12 @@ xfs_bmap_validate_ret(
  */
 void
 xfs_bmap_add_free(
+       struct xfs_mount        *mp,            /* mount point structure */
+       struct xfs_bmap_free    *flist,         /* list of extents */
        xfs_fsblock_t           bno,            /* fs block number of extent */
-       xfs_filblks_t           len,            /* length of extent */
-       xfs_bmap_free_t         *flist,         /* list of extents */
-       xfs_mount_t             *mp)            /* mount point structure */
+       xfs_filblks_t           len)            /* length of extent */
 {
-       xfs_bmap_free_item_t    *cur;           /* current (next) element */
-       xfs_bmap_free_item_t    *new;           /* new element */
-       xfs_bmap_free_item_t    *prev;          /* previous element */
+       struct xfs_bmap_free_item       *new;           /* new element */
 #ifdef DEBUG
        xfs_agnumber_t          agno;
        xfs_agblock_t           agbno;
@@ -597,17 +595,7 @@ xfs_bmap_add_free(
        new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
        new->xbfi_startblock = bno;
        new->xbfi_blockcount = (xfs_extlen_t)len;
-       for (prev = NULL, cur = flist->xbf_first;
-            cur != NULL;
-            prev = cur, cur = cur->xbfi_next) {
-               if (cur->xbfi_startblock >= bno)
-                       break;
-       }
-       if (prev)
-               prev->xbfi_next = new;
-       else
-               flist->xbf_first = new;
-       new->xbfi_next = cur;
+       list_add(&new->xbfi_list, &flist->xbf_flist);
        flist->xbf_count++;
 }
 
@@ -617,14 +605,10 @@ xfs_bmap_add_free(
  */
 void
 xfs_bmap_del_free(
-       xfs_bmap_free_t         *flist, /* free item list header */
-       xfs_bmap_free_item_t    *prev,  /* previous item on list, if any */
-       xfs_bmap_free_item_t    *free)  /* list item to be freed */
+       struct xfs_bmap_free            *flist, /* free item list header */
+       struct xfs_bmap_free_item       *free)  /* list item to be freed */
 {
-       if (prev)
-               prev->xbfi_next = free->xbfi_next;
-       else
-               flist->xbf_first = free->xbfi_next;
+       list_del(&free->xbfi_list);
        flist->xbf_count--;
        kmem_zone_free(xfs_bmap_free_item_zone, free);
 }
@@ -634,17 +618,16 @@ xfs_bmap_del_free(
  */
 void
 xfs_bmap_cancel(
-       xfs_bmap_free_t         *flist) /* list of bmap_free_items */
+       struct xfs_bmap_free            *flist) /* list of bmap_free_items */
 {
-       xfs_bmap_free_item_t    *free;  /* free list item */
-       xfs_bmap_free_item_t    *next;
+       struct xfs_bmap_free_item       *free;  /* free list item */
 
        if (flist->xbf_count == 0)
                return;
-       ASSERT(flist->xbf_first != NULL);
-       for (free = flist->xbf_first; free; free = next) {
-               next = free->xbfi_next;
-               xfs_bmap_del_free(flist, NULL, free);
+       while (!list_empty(&flist->xbf_flist)) {
+               free = list_first_entry(&flist->xbf_flist,
+                               struct xfs_bmap_free_item, xbfi_list);
+               xfs_bmap_del_free(flist, free);
        }
        ASSERT(flist->xbf_count == 0);
 }
@@ -699,7 +682,7 @@ xfs_bmap_btree_to_extents(
        cblock = XFS_BUF_TO_BLOCK(cbp);
        if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
                return error;
-       xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+       xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1);
        ip->i_d.di_nblocks--;
        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
        xfs_trans_binval(tp, cbp);
@@ -5073,8 +5056,8 @@ xfs_bmap_del_extent(
         * If we need to, add to list of extents to delete.
         */
        if (do_fx)
-               xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
-                       mp);
+               xfs_bmap_add_free(mp, flist, del->br_startblock,
+                       del->br_blockcount);
        /*
         * Adjust inode # blocks in the file.
         */
index 423a34e..f1f3ae6 100644 (file)
@@ -62,12 +62,12 @@ struct xfs_bmalloca {
  * List of extents to be free "later".
  * The list is kept sorted on xbf_startblock.
  */
-typedef struct xfs_bmap_free_item
+struct xfs_bmap_free_item
 {
        xfs_fsblock_t           xbfi_startblock;/* starting fs block number */
        xfs_extlen_t            xbfi_blockcount;/* number of blocks in extent */
-       struct xfs_bmap_free_item *xbfi_next;   /* link to next entry */
-} xfs_bmap_free_item_t;
+       struct list_head        xbfi_list;
+};
 
 /*
  * Header for free extent list.
@@ -85,7 +85,7 @@ typedef struct xfs_bmap_free_item
  */
 typedef        struct xfs_bmap_free
 {
-       xfs_bmap_free_item_t    *xbf_first;     /* list of to-be-free extents */
+       struct list_head        xbf_flist;      /* list of to-be-free extents */
        int                     xbf_count;      /* count of items on list */
        int                     xbf_low;        /* alloc in low mode */
 } xfs_bmap_free_t;
@@ -141,8 +141,10 @@ static inline int xfs_bmapi_aflag(int w)
 
 static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
 {
-       ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
-               (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK);
+       INIT_LIST_HEAD(&flp->xbf_flist);
+       flp->xbf_count = 0;
+       flp->xbf_low = 0;
+       *fbp = NULLFSBLOCK;
 }
 
 /*
@@ -191,8 +193,8 @@ void        xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
 
 int    xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
 void   xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
-void   xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
-               struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void   xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist,
+                         xfs_fsblock_t bno, xfs_filblks_t len);
 void   xfs_bmap_cancel(struct xfs_bmap_free *flist);
 int    xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
                        struct xfs_inode *ip);
index 6282f6e..db0c71e 100644 (file)
@@ -526,7 +526,7 @@ xfs_bmbt_free_block(
        struct xfs_trans        *tp = cur->bc_tp;
        xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
 
-       xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+       xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1);
        ip->i_d.di_nblocks--;
 
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
index 1f88e1c..07eeb0b 100644 (file)
@@ -543,12 +543,12 @@ xfs_btree_ptr_addr(
  */
 STATIC struct xfs_btree_block *
 xfs_btree_get_iroot(
-       struct xfs_btree_cur    *cur)
+       struct xfs_btree_cur    *cur)
 {
-       struct xfs_ifork        *ifp;
+       struct xfs_ifork        *ifp;
 
-       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
-       return (struct xfs_btree_block *)ifp->if_broot;
+       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+       return (struct xfs_btree_block *)ifp->if_broot;
 }
 
 /*
@@ -4152,3 +4152,22 @@ xfs_btree_sblock_verify(
 
        return true;
 }
+
+/*
+ * Calculate the number of btree levels needed to store a given number of
+ * records in a short-format btree.
+ */
+uint
+xfs_btree_compute_maxlevels(
+       struct xfs_mount        *mp,
+       uint                    *limits,
+       unsigned long           len)
+{
+       uint                    level;
+       unsigned long           maxblocks;
+
+       maxblocks = (len + limits[0] - 1) / limits[0];
+       for (level = 1; maxblocks > 1; level++)
+               maxblocks = (maxblocks + limits[1] - 1) / limits[1];
+       return level;
+}
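
A worked example of the computation, with assumed per-block record limits
rather than real on-disk values:

	/*
	 * Assume limits[0] = 500 minimum records per leaf, limits[1] = 250
	 * minimum records per node, and len = 1,000,000 records to index:
	 *
	 *   leaves needed:  (1000000 + 499) / 500 = 2000   (level 1)
	 *   nodes above:    (2000 + 249) / 250    =    8   (level 2)
	 *   nodes above:    (8 + 249) / 250       =    1   -> root, 3 levels
	 *
	 * xfs_alloc_compute_maxlevels() above feeds in mp->m_alloc_mnr and
	 * half the AG block count as the worst-case record count.
	 */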
index 2e874be..785a996 100644 (file)
@@ -474,5 +474,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
 
 bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
 bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
+uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
+                                unsigned long len);
 
 #endif /* __XFS_BTREE_H__ */
index 097bf77..0f1f165 100644 (file)
@@ -356,7 +356,6 @@ xfs_da3_split(
        struct xfs_da_state_blk *newblk;
        struct xfs_da_state_blk *addblk;
        struct xfs_da_intnode   *node;
-       struct xfs_buf          *bp;
        int                     max;
        int                     action = 0;
        int                     error;
@@ -397,7 +396,9 @@ xfs_da3_split(
                                break;
                        }
                        /*
-                        * Entry wouldn't fit, split the leaf again.
+                        * Entry wouldn't fit, split the leaf again. The new
+                        * extrablk will be consumed by xfs_da3_node_split if
+                        * the node is split.
                         */
                        state->extravalid = 1;
                        if (state->inleaf) {
@@ -445,6 +446,14 @@ xfs_da3_split(
        if (!addblk)
                return 0;
 
+       /*
+        * xfs_da3_node_split() should have consumed any extra blocks we added
+        * during a double leaf split in the attr fork. This is guaranteed as
+        * we can't be here if the attr fork only has a single leaf block.
+        */
+       ASSERT(state->extravalid == 0 ||
+              state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
        /*
         * Split the root node.
         */
@@ -457,43 +466,33 @@ xfs_da3_split(
        }
 
        /*
-        * Update pointers to the node which used to be block 0 and
-        * just got bumped because of the addition of a new root node.
-        * There might be three blocks involved if a double split occurred,
-        * and the original block 0 could be at any position in the list.
+        * Update pointers to the node which used to be block 0 and just got
+        * bumped because of the addition of a new root node.  Note that the
+        * original block 0 could be at any position in the list of blocks in
+        * the tree.
         *
-        * Note: the magic numbers and sibling pointers are in the same
-        * physical place for both v2 and v3 headers (by design). Hence it
-        * doesn't matter which version of the xfs_da_intnode structure we use
-        * here as the result will be the same using either structure.
+        * Note: the magic numbers and sibling pointers are in the same physical
+        * place for both v2 and v3 headers (by design). Hence it doesn't matter
+        * which version of the xfs_da_intnode structure we use here as the
+        * result will be the same using either structure.
         */
        node = oldblk->bp->b_addr;
        if (node->hdr.info.forw) {
-               if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
-                       bp = addblk->bp;
-               } else {
-                       ASSERT(state->extravalid);
-                       bp = state->extrablk.bp;
-               }
-               node = bp->b_addr;
+               ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
+               node = addblk->bp->b_addr;
                node->hdr.info.back = cpu_to_be32(oldblk->blkno);
-               xfs_trans_log_buf(state->args->trans, bp,
-                   XFS_DA_LOGRANGE(node, &node->hdr.info,
-                   sizeof(node->hdr.info)));
+               xfs_trans_log_buf(state->args->trans, addblk->bp,
+                                 XFS_DA_LOGRANGE(node, &node->hdr.info,
+                                 sizeof(node->hdr.info)));
        }
        node = oldblk->bp->b_addr;
        if (node->hdr.info.back) {
-               if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
-                       bp = addblk->bp;
-               } else {
-                       ASSERT(state->extravalid);
-                       bp = state->extrablk.bp;
-               }
-               node = bp->b_addr;
+               ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
+               node = addblk->bp->b_addr;
                node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
-               xfs_trans_log_buf(state->args->trans, bp,
-                   XFS_DA_LOGRANGE(node, &node->hdr.info,
-                   sizeof(node->hdr.info)));
+               xfs_trans_log_buf(state->args->trans, addblk->bp,
+                                 XFS_DA_LOGRANGE(node, &node->hdr.info,
+                                 sizeof(node->hdr.info)));
        }
        addblk->bp = NULL;
        return 0;
index 9d624a6..f1e8d4d 100644 (file)
@@ -40,8 +40,7 @@ xfs_dir2_sf_entsize(
        int count = sizeof(struct xfs_dir2_sf_entry);   /* namelen + offset */
 
        count += len;                                   /* name */
-       count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
-                               sizeof(xfs_dir2_ino4_t); /* ino # */
+       count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
        return count;
 }
 
@@ -125,33 +124,33 @@ xfs_dir3_sfe_put_ftype(
 static xfs_ino_t
 xfs_dir2_sf_get_ino(
        struct xfs_dir2_sf_hdr  *hdr,
-       xfs_dir2_inou_t         *from)
+       __uint8_t               *from)
 {
        if (hdr->i8count)
-               return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
+               return get_unaligned_be64(from) & 0x00ffffffffffffffULL;
        else
-               return get_unaligned_be32(&from->i4.i);
+               return get_unaligned_be32(from);
 }
 
 static void
 xfs_dir2_sf_put_ino(
        struct xfs_dir2_sf_hdr  *hdr,
-       xfs_dir2_inou_t         *to,
+       __uint8_t               *to,
        xfs_ino_t               ino)
 {
        ASSERT((ino & 0xff00000000000000ULL) == 0);
 
        if (hdr->i8count)
-               put_unaligned_be64(ino, &to->i8.i);
+               put_unaligned_be64(ino, to);
        else
-               put_unaligned_be32(ino, &to->i4.i);
+               put_unaligned_be32(ino, to);
 }
 
 static xfs_ino_t
 xfs_dir2_sf_get_parent_ino(
        struct xfs_dir2_sf_hdr  *hdr)
 {
-       return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
+       return xfs_dir2_sf_get_ino(hdr, hdr->parent);
 }
 
 static void
@@ -159,7 +158,7 @@ xfs_dir2_sf_put_parent_ino(
        struct xfs_dir2_sf_hdr  *hdr,
        xfs_ino_t               ino)
 {
-       xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
+       xfs_dir2_sf_put_ino(hdr, hdr->parent, ino);
 }
 
 /*
@@ -173,8 +172,7 @@ xfs_dir2_sfe_get_ino(
        struct xfs_dir2_sf_hdr  *hdr,
        struct xfs_dir2_sf_entry *sfep)
 {
-       return xfs_dir2_sf_get_ino(hdr,
-                               (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
+       return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]);
 }
 
 static void
@@ -183,8 +181,7 @@ xfs_dir2_sfe_put_ino(
        struct xfs_dir2_sf_entry *sfep,
        xfs_ino_t               ino)
 {
-       xfs_dir2_sf_put_ino(hdr,
-                           (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
+       xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino);
 }
 
 static xfs_ino_t
@@ -192,8 +189,7 @@ xfs_dir3_sfe_get_ino(
        struct xfs_dir2_sf_hdr  *hdr,
        struct xfs_dir2_sf_entry *sfep)
 {
-       return xfs_dir2_sf_get_ino(hdr,
-                       (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
+       return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]);
 }
 
 static void
@@ -202,8 +198,7 @@ xfs_dir3_sfe_put_ino(
        struct xfs_dir2_sf_entry *sfep,
        xfs_ino_t               ino)
 {
-       xfs_dir2_sf_put_ino(hdr,
-                       (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
+       xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino);
 }
 
 
index 8d4d8bc..685f23b 100644 (file)
@@ -191,12 +191,6 @@ typedef    __uint16_t      xfs_dir2_data_off_t;
 #define        NULLDATAOFF     0xffffU
 typedef uint           xfs_dir2_data_aoff_t;   /* argument form */
 
-/*
- * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
- * Only need 16 bits, this is the byte offset into the single block form.
- */
-typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
-
 /*
  * Offset in data space of a data entry.
  */
@@ -214,22 +208,10 @@ typedef   xfs_off_t       xfs_dir2_off_t;
  */
 typedef        __uint32_t      xfs_dir2_db_t;
 
-/*
- * Inode number stored as 8 8-bit values.
- */
-typedef        struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
-
-/*
- * Inode number stored as 4 8-bit values.
- * Works a lot of the time, when all the inode numbers in a directory
- * fit in 32 bits.
- */
-typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
+#define XFS_INO32_SIZE 4
+#define XFS_INO64_SIZE 8
+#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE)
 
-typedef union {
-       xfs_dir2_ino8_t i8;
-       xfs_dir2_ino4_t i4;
-} xfs_dir2_inou_t;
 #define        XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
 
 /*
@@ -246,39 +228,38 @@ typedef union {
 typedef struct xfs_dir2_sf_hdr {
        __uint8_t               count;          /* count of entries */
        __uint8_t               i8count;        /* count of 8-byte inode #s */
-       xfs_dir2_inou_t         parent;         /* parent dir inode number */
-} __arch_pack xfs_dir2_sf_hdr_t;
+       __uint8_t               parent[8];      /* parent dir inode number */
+} __packed xfs_dir2_sf_hdr_t;
 
 typedef struct xfs_dir2_sf_entry {
        __u8                    namelen;        /* actual name length */
-       xfs_dir2_sf_off_t       offset;         /* saved offset */
+       __u8                    offset[2];      /* saved offset */
        __u8                    name[];         /* name, variable size */
        /*
         * A single byte containing the file type field follows the inode
         * number for version 3 directory entries.
         *
-        * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
-        * variable offset after the name.
+        * A 64-bit or 32-bit inode number follows here, at a variable offset
+        * after the name.
         */
-} __arch_pack xfs_dir2_sf_entry_t;
+} xfs_dir2_sf_entry_t;
 
 static inline int xfs_dir2_sf_hdr_size(int i8count)
 {
        return sizeof(struct xfs_dir2_sf_hdr) -
-               (i8count == 0) *
-               (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
+               (i8count == 0) * XFS_INO64_DIFF;
 }
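
With the packed header the size arithmetic works out as follows (a worked
example, not part of the patch):

	/*
	 * The packed header is 1 (count) + 1 (i8count) + 8 (parent) = 10
	 * bytes.  With i8count == 0 the parent inode number only needs 4
	 * bytes, so XFS_INO64_DIFF (4) is subtracted:
	 *
	 *   xfs_dir2_sf_hdr_size(0) = 10 - 4 =  6 bytes
	 *   xfs_dir2_sf_hdr_size(1) = 10 - 0 = 10 bytes
	 *
	 * matching the layout the old __arch_pack annotation produced.
	 */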
 
 static inline xfs_dir2_data_aoff_t
 xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
 {
-       return get_unaligned_be16(&sfep->offset.i);
+       return get_unaligned_be16(sfep->offset);
 }
 
 static inline void
 xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
 {
-       put_unaligned_be16(off, &sfep->offset.i);
+       put_unaligned_be16(off, sfep->offset);
 }
 
 static inline struct xfs_dir2_sf_entry *
index e5bb9cc..c6809ff 100644 (file)
@@ -126,13 +126,12 @@ xfs_dir2_block_sfsize(
                /*
                 * Calculate the new size, see if we should give up yet.
                 */
-               size = xfs_dir2_sf_hdr_size(i8count) +          /* header */
-                      count +                                  /* namelen */
-                      count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
-                      namelen +                                /* name */
-                      (i8count ?                               /* inumber */
-                               (uint)sizeof(xfs_dir2_ino8_t) * count :
-                               (uint)sizeof(xfs_dir2_ino4_t) * count);
+               size = xfs_dir2_sf_hdr_size(i8count) +  /* header */
+                      count * 3 * sizeof(u8) +         /* namelen + offset */
+                      namelen +                        /* name */
+                      (i8count ?                       /* inumber */
+                               count * XFS_INO64_SIZE :
+                               count * XFS_INO32_SIZE);
                if (size > XFS_IFORK_DSIZE(dp))
                        return size;            /* size value is a failure */
        }
@@ -319,10 +318,7 @@ xfs_dir2_sf_addname(
                /*
                 * Yes, adjust the inode size.  old count + (parent + new)
                 */
-               incr_isize +=
-                       (sfp->count + 2) *
-                       ((uint)sizeof(xfs_dir2_ino8_t) -
-                        (uint)sizeof(xfs_dir2_ino4_t));
+               incr_isize += (sfp->count + 2) * XFS_INO64_DIFF;
                objchange = 1;
        }
 
@@ -897,11 +893,7 @@ xfs_dir2_sf_replace(
                int     error;                  /* error return value */
                int     newsize;                /* new inode size */
 
-               newsize =
-                       dp->i_df.if_bytes +
-                       (sfp->count + 1) *
-                       ((uint)sizeof(xfs_dir2_ino8_t) -
-                        (uint)sizeof(xfs_dir2_ino4_t));
+               newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
                /*
                 * Won't fit as shortform, convert to block then do replace.
                 */
@@ -1022,10 +1014,7 @@ xfs_dir2_sf_toino4(
        /*
         * Compute the new inode size.
         */
-       newsize =
-               oldsize -
-               (oldsfp->count + 1) *
-               ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+       newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF;
        xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
        xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
        /*
@@ -1048,7 +1037,7 @@ xfs_dir2_sf_toino4(
             i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
                  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
                sfep->namelen = oldsfep->namelen;
-               sfep->offset = oldsfep->offset;
+               memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
                memcpy(sfep->name, oldsfep->name, sfep->namelen);
                dp->d_ops->sf_put_ino(sfp, sfep,
                                      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
@@ -1098,10 +1087,7 @@ xfs_dir2_sf_toino8(
        /*
         * Compute the new inode size (nb: entry count + 1 for parent)
         */
-       newsize =
-               oldsize +
-               (oldsfp->count + 1) *
-               ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+       newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF;
        xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
        xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
        /*
@@ -1124,7 +1110,7 @@ xfs_dir2_sf_toino8(
             i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
                  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
                sfep->namelen = oldsfep->namelen;
-               sfep->offset = oldsfep->offset;
+               memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
                memcpy(sfep->name, oldsfep->name, sfep->namelen);
                dp->d_ops->sf_put_ino(sfp, sfep,
                                      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
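As a sanity check on the new entry-size arithmetic in this file, the formula can be reproduced in user space. The sketch below is only an illustration: XFS_INO32_SIZE and XFS_INO64_SIZE are assumed to be the packed on-disk widths of a 32-bit and a 64-bit inode number (4 and 8 bytes), and sf_hdr_size() is a hypothetical stand-in for xfs_dir2_sf_hdr_size(), not the real helper.

#include <stdio.h>

/* assumed packed widths of a 32-bit and a 64-bit inode number */
#define XFS_INO32_SIZE  4
#define XFS_INO64_SIZE  8
#define XFS_INO64_DIFF  (XFS_INO64_SIZE - XFS_INO32_SIZE)

/*
 * Hypothetical stand-in for xfs_dir2_sf_hdr_size(): count byte, i8count
 * byte, plus a parent inode number whose width depends on i8count.
 */
static int sf_hdr_size(int i8count)
{
        return 2 + (i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE);
}

/*
 * Mirror of the size formula above: header, one namelen byte and a 2-byte
 * offset per entry, the name bytes themselves, and one inode number per
 * entry whose width depends on i8count.
 */
static int sf_dir_size(int count, int i8count, int total_namelen)
{
        return sf_hdr_size(i8count) +
               count * 3 +
               total_namelen +
               count * (i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE);
}

int main(void)
{
        printf("3 entries, 24 name bytes, 32-bit inos: %d bytes\n",
               sf_dir_size(3, 0, 24));
        printf("3 entries, 24 name bytes, 64-bit inos: %d bytes\n",
               sf_dir_size(3, 1, 24));
        return 0;
}

The 16-byte difference between the two results is (count + 1) * XFS_INO64_DIFF: one widened inode number per entry plus the widened parent pointer in the header, which matches the (count + 1) and (count + 2) adjustments made in the hunks above.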
index dc97eb2..adb204d 100644 (file)
@@ -1435,41 +1435,57 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
  * with the crc feature bit, and all accesses to them must be conditional on
  * that flag.
  */
+/* short form block header */
+struct xfs_btree_block_shdr {
+       __be32          bb_leftsib;
+       __be32          bb_rightsib;
+
+       __be64          bb_blkno;
+       __be64          bb_lsn;
+       uuid_t          bb_uuid;
+       __be32          bb_owner;
+       __le32          bb_crc;
+};
+
+/* long form block header */
+struct xfs_btree_block_lhdr {
+       __be64          bb_leftsib;
+       __be64          bb_rightsib;
+
+       __be64          bb_blkno;
+       __be64          bb_lsn;
+       uuid_t          bb_uuid;
+       __be64          bb_owner;
+       __le32          bb_crc;
+       __be32          bb_pad; /* padding for alignment */
+};
+
 struct xfs_btree_block {
        __be32          bb_magic;       /* magic number for block type */
        __be16          bb_level;       /* 0 is a leaf */
        __be16          bb_numrecs;     /* current # of data records */
        union {
-               struct {
-                       __be32          bb_leftsib;
-                       __be32          bb_rightsib;
-
-                       __be64          bb_blkno;
-                       __be64          bb_lsn;
-                       uuid_t          bb_uuid;
-                       __be32          bb_owner;
-                       __le32          bb_crc;
-               } s;                    /* short form pointers */
-               struct  {
-                       __be64          bb_leftsib;
-                       __be64          bb_rightsib;
-
-                       __be64          bb_blkno;
-                       __be64          bb_lsn;
-                       uuid_t          bb_uuid;
-                       __be64          bb_owner;
-                       __le32          bb_crc;
-                       __be32          bb_pad; /* padding for alignment */
-               } l;                    /* long form pointers */
+               struct xfs_btree_block_shdr s;
+               struct xfs_btree_block_lhdr l;
        } bb_u;                         /* rest */
 };
 
-#define XFS_BTREE_SBLOCK_LEN   16      /* size of a short form block */
-#define XFS_BTREE_LBLOCK_LEN   24      /* size of a long form block */
+/* size of a short form block */
+#define XFS_BTREE_SBLOCK_LEN \
+       (offsetof(struct xfs_btree_block, bb_u) + \
+        offsetof(struct xfs_btree_block_shdr, bb_blkno))
+/* size of a long form block */
+#define XFS_BTREE_LBLOCK_LEN \
+       (offsetof(struct xfs_btree_block, bb_u) + \
+        offsetof(struct xfs_btree_block_lhdr, bb_blkno))
 
 /* sizes of CRC enabled btree blocks */
-#define XFS_BTREE_SBLOCK_CRC_LEN       (XFS_BTREE_SBLOCK_LEN + 40)
-#define XFS_BTREE_LBLOCK_CRC_LEN       (XFS_BTREE_LBLOCK_LEN + 48)
+#define XFS_BTREE_SBLOCK_CRC_LEN \
+       (offsetof(struct xfs_btree_block, bb_u) + \
+        sizeof(struct xfs_btree_block_shdr))
+#define XFS_BTREE_LBLOCK_CRC_LEN \
+       (offsetof(struct xfs_btree_block, bb_u) + \
+        sizeof(struct xfs_btree_block_lhdr))
 
 #define XFS_BTREE_SBLOCK_CRC_OFF \
        offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
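The switch from hard-coded lengths to offsetof() arithmetic is easy to verify with a stand-alone mock-up of the same layout. The structs below drop the endianness annotations and replace uuid_t with a byte array, so they are an assumption about how the on-disk fields pack rather than the real definitions; on common ABIs the program prints 16, 24, 56 and 72, matching the constants being replaced (16 + 40 and 24 + 48 for the CRC variants).

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* mock-up of the block headers above, endian annotations dropped */
struct btree_block_shdr {
        uint32_t        bb_leftsib;
        uint32_t        bb_rightsib;
        uint64_t        bb_blkno;
        uint64_t        bb_lsn;
        uint8_t         bb_uuid[16];
        uint32_t        bb_owner;
        uint32_t        bb_crc;
};

struct btree_block_lhdr {
        uint64_t        bb_leftsib;
        uint64_t        bb_rightsib;
        uint64_t        bb_blkno;
        uint64_t        bb_lsn;
        uint8_t         bb_uuid[16];
        uint64_t        bb_owner;
        uint32_t        bb_crc;
        uint32_t        bb_pad;
};

struct btree_block {
        uint32_t        bb_magic;
        uint16_t        bb_level;
        uint16_t        bb_numrecs;
        union {
                struct btree_block_shdr s;
                struct btree_block_lhdr l;
        } bb_u;
};

int main(void)
{
        /* non-CRC lengths stop at bb_blkno, CRC lengths cover the whole header */
        printf("SBLOCK_LEN=%zu LBLOCK_LEN=%zu\n",
               offsetof(struct btree_block, bb_u) +
                       offsetof(struct btree_block_shdr, bb_blkno),
               offsetof(struct btree_block, bb_u) +
                       offsetof(struct btree_block_lhdr, bb_blkno));
        printf("SBLOCK_CRC_LEN=%zu LBLOCK_CRC_LEN=%zu\n",
               offsetof(struct btree_block, bb_u) +
                       sizeof(struct btree_block_shdr),
               offsetof(struct btree_block, bb_u) +
                       sizeof(struct btree_block_lhdr));
        return 0;
}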
index fffe3d0..f5ec9c5 100644 (file)
@@ -521,12 +521,8 @@ typedef struct xfs_swapext
 #define XFS_IOC_ERROR_CLEARALL      _IOW ('X', 117, struct xfs_error_injection)
 /*     XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118      */
 
-/*     XFS_IOC_FREEZE            -- FIFREEZE   119      */
-/*     XFS_IOC_THAW              -- FITHAW     120      */
-#ifndef FIFREEZE
-#define XFS_IOC_FREEZE              _IOWR('X', 119, int)
-#define XFS_IOC_THAW                _IOWR('X', 120, int)
-#endif
+#define XFS_IOC_FREEZE              _IOWR('X', 119, int)       /* aka FIFREEZE */
+#define XFS_IOC_THAW                _IOWR('X', 120, int)       /* aka FITHAW */
 
 #define XFS_IOC_FSSETDM_BY_HANDLE    _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
 #define XFS_IOC_ATTRLIST_BY_HANDLE   _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
index 22297f9..4b1e408 100644 (file)
@@ -1828,9 +1828,8 @@ xfs_difree_inode_chunk(
 
        if (!xfs_inobt_issparse(rec->ir_holemask)) {
                /* not sparse, calculate extent info directly */
-               xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
-                                 XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
-                                 mp->m_ialloc_blks, flist, mp);
+               xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno),
+                                 mp->m_ialloc_blks);
                return;
        }
 
@@ -1873,8 +1872,8 @@ xfs_difree_inode_chunk(
 
                ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
                ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
-               xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
-                                 flist, mp);
+               xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno),
+                                 contigblk);
 
                /* reset range to current bit and carry on... */
                startidx = endidx = nextbit;
@@ -2395,20 +2394,11 @@ void
 xfs_ialloc_compute_maxlevels(
        xfs_mount_t     *mp)            /* file system mount structure */
 {
-       int             level;
-       uint            maxblocks;
-       uint            maxleafents;
-       int             minleafrecs;
-       int             minnoderecs;
-
-       maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
-               XFS_INODES_PER_CHUNK_LOG;
-       minleafrecs = mp->m_inobt_mnr[0];
-       minnoderecs = mp->m_inobt_mnr[1];
-       maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
-       for (level = 1; maxblocks > 1; level++)
-               maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
-       mp->m_in_maxlevels = level;
+       uint            inodes;
+
+       inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
+       mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr,
+                                                        inodes);
 }
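xfs_btree_compute_maxlevels() itself is not part of this hunk, so the sketch below simply re-derives the open-coded loop being removed; its signature and the sample fanout numbers are assumptions. It shows how the worst-case tree height follows from the minimum records per leaf and per node block.

#include <stdio.h>

/*
 * Sketch of the generic helper this hunk switches to; the body reproduces
 * the open-coded loop removed above, so the signature of the real
 * xfs_btree_compute_maxlevels() is an assumption here.
 */
static unsigned int compute_maxlevels(const unsigned int *minrecs,
                                      unsigned long long leaf_entries)
{
        unsigned long long      maxblocks;
        unsigned int            level;

        /* leaf level: blocks needed if every leaf holds the minimum */
        maxblocks = (leaf_entries + minrecs[0] - 1) / minrecs[0];
        /* node levels: keep dividing by the minimum node fanout */
        for (level = 1; maxblocks > 1; level++)
                maxblocks = (maxblocks + minrecs[1] - 1) / minrecs[1];
        return level;
}

int main(void)
{
        /* hypothetical minimum records per inobt leaf and node block */
        unsigned int minrecs[2] = { 30, 60 };

        printf("max inobt levels for 2^22 inode records: %u\n",
               compute_maxlevels(minrecs, 1ULL << 22));
        return 0;
}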
 
 /*
index 951c044..e2e1106 100644 (file)
@@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
  * Get a buffer for the bitmap or summary file block specified.
  * The buffer is returned read and locked.
  */
-int
+static int
 xfs_rtbuf_get(
        xfs_mount_t     *mp,            /* file system mount structure */
        xfs_trans_t     *tp,            /* transaction pointer */
index 87d2b21..7575cfc 100644 (file)
@@ -87,6 +87,12 @@ xfs_find_bdev_for_inode(
  * We're now finished for good with this page.  Update the page state via the
  * associated buffer_heads, paying attention to the start and end offsets that
  * we need to process on the page.
+ *
+ * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+ * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+ * the page at all, as we may be racing with memory reclaim and it can free both
+ * the bufferhead chain and the page as it will see the page as clean and
+ * unused.
  */
 static void
 xfs_finish_page_writeback(
@@ -95,8 +101,9 @@ xfs_finish_page_writeback(
        int                     error)
 {
        unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
-       struct buffer_head      *head, *bh;
+       struct buffer_head      *head, *bh, *next;
        unsigned int            off = 0;
+       unsigned int            bsize;
 
        ASSERT(bvec->bv_offset < PAGE_SIZE);
        ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
@@ -105,15 +112,17 @@ xfs_finish_page_writeback(
 
        bh = head = page_buffers(bvec->bv_page);
 
+       bsize = bh->b_size;
        do {
+               next = bh->b_this_page;
                if (off < bvec->bv_offset)
                        goto next_bh;
                if (off > end)
                        break;
                bh->b_end_io(bh, !error);
 next_bh:
-               off += bh->b_size;
-       } while ((bh = bh->b_this_page) != head);
+               off += bsize;
+       } while ((bh = next) != head);
 }
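The iteration pattern the landmine comment requires, caching the next pointer and the buffer size before invoking the completion handler, can be shown with a small user-space ring. The struct and helper below are stand-ins for buffer_heads and b_end_io(), not kernel code.

#include <stdio.h>
#include <stdlib.h>

/* minimal stand-in for a circular buffer_head ring */
struct bh {
        struct bh       *b_this_page;   /* circular next pointer */
        unsigned int    b_size;
        int             id;
};

/* stand-in for bh->b_end_io(): completing the buffer may free it */
static void end_io(struct bh *bh)
{
        printf("completed bh %d\n", bh->id);
        free(bh);
}

static void finish_ring(struct bh *head, unsigned int start, unsigned int end)
{
        unsigned int    bsize = head->b_size;   /* cache before any callback */
        unsigned int    off = 0;
        struct bh       *bh = head;
        struct bh       *next;

        do {
                next = bh->b_this_page;         /* cache before bh can vanish */
                if (off >= start && off <= end)
                        end_io(bh);
                off += bsize;
                /* the stale head pointer is only compared, never dereferenced */
        } while ((bh = next) != head);
}

int main(void)
{
        struct bh       *nodes[4];
        int             i;

        for (i = 0; i < 4; i++) {
                nodes[i] = malloc(sizeof(struct bh));
                nodes[i]->b_size = 512;
                nodes[i]->id = i;
        }
        for (i = 0; i < 4; i++)
                nodes[i]->b_this_page = nodes[(i + 1) % 4];

        /* complete the whole ring; without the cached next pointer the loop
         * would dereference freed memory after the first callback */
        finish_ring(nodes[0], 0, 4 * 512);
        return 0;
}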
 
 /*
@@ -1041,6 +1050,20 @@ xfs_vm_releasepage(
 
        trace_xfs_releasepage(page->mapping->host, page, 0, 0);
 
+       /*
+        * mm accommodates an old ext3 case where clean pages might not have had
+        * the dirty bit cleared. Thus, it can send actual dirty pages to
+        * ->releasepage() via shrink_active_list(). Conversely,
+        * block_invalidatepage() can send pages that are still marked dirty
+        * but otherwise have invalidated buffers.
+        *
+        * We've historically freed buffers on the latter. Instead, quietly
+        * filter out all dirty pages to avoid spurious buffer state warnings.
+        * This can likely be removed once shrink_active_list() is fixed.
+        */
+       if (PageDirty(page))
+               return 0;
+
        xfs_count_page_state(page, &delalloc, &unwritten);
 
        if (WARN_ON_ONCE(delalloc))
@@ -1144,6 +1167,8 @@ __xfs_get_blocks(
        ssize_t                 size;
        int                     new = 0;
 
+       BUG_ON(create && !direct);
+
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
 
@@ -1151,22 +1176,14 @@ __xfs_get_blocks(
        ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
        size = bh_result->b_size;
 
-       if (!create && direct && offset >= i_size_read(inode))
+       if (!create && offset >= i_size_read(inode))
                return 0;
 
        /*
         * Direct I/O is usually done on preallocated files, so try getting
-        * a block mapping without an exclusive lock first.  For buffered
-        * writes we already have the exclusive iolock anyway, so avoiding
-        * a lock roundtrip here by taking the ilock exclusive from the
-        * beginning is a useful micro optimization.
+        * a block mapping without an exclusive lock first.
         */
-       if (create && !direct) {
-               lockmode = XFS_ILOCK_EXCL;
-               xfs_ilock(ip, lockmode);
-       } else {
-               lockmode = xfs_ilock_data_map_shared(ip);
-       }
+       lockmode = xfs_ilock_data_map_shared(ip);
 
        ASSERT(offset <= mp->m_super->s_maxbytes);
        if (offset + size > mp->m_super->s_maxbytes)
@@ -1185,37 +1202,19 @@ __xfs_get_blocks(
             (imap.br_startblock == HOLESTARTBLOCK ||
              imap.br_startblock == DELAYSTARTBLOCK) ||
             (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
-               if (direct || xfs_get_extsz_hint(ip)) {
-                       /*
-                        * xfs_iomap_write_direct() expects the shared lock. It
-                        * is unlocked on return.
-                        */
-                       if (lockmode == XFS_ILOCK_EXCL)
-                               xfs_ilock_demote(ip, lockmode);
-
-                       error = xfs_iomap_write_direct(ip, offset, size,
-                                                      &imap, nimaps);
-                       if (error)
-                               return error;
-                       new = 1;
+               /*
+                * xfs_iomap_write_direct() expects the shared lock. It
+                * is unlocked on return.
+                */
+               if (lockmode == XFS_ILOCK_EXCL)
+                       xfs_ilock_demote(ip, lockmode);
 
-               } else {
-                       /*
-                        * Delalloc reservations do not require a transaction,
-                        * we can go on without dropping the lock here. If we
-                        * are allocating a new delalloc block, make sure that
-                        * we set the new flag so that we mark the buffer new so
-                        * that we know that it is newly allocated if the write
-                        * fails.
-                        */
-                       if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
-                               new = 1;
-                       error = xfs_iomap_write_delay(ip, offset, size, &imap);
-                       if (error)
-                               goto out_unlock;
+               error = xfs_iomap_write_direct(ip, offset, size,
+                                              &imap, nimaps);
+               if (error)
+                       return error;
+               new = 1;
 
-                       xfs_iunlock(ip, lockmode);
-               }
                trace_xfs_get_blocks_alloc(ip, offset, size,
                                ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
                                                   : XFS_IO_DELALLOC, &imap);
@@ -1236,9 +1235,7 @@ __xfs_get_blocks(
        }
 
        /* trim mapping down to size requested */
-       if (direct || size > (1 << inode->i_blkbits))
-               xfs_map_trim_size(inode, iblock, bh_result,
-                                 &imap, offset, size);
+       xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
 
        /*
         * For unwritten extents do not report a disk address in the buffered
@@ -1251,7 +1248,7 @@ __xfs_get_blocks(
                if (ISUNWRITTEN(&imap))
                        set_buffer_unwritten(bh_result);
                /* direct IO needs special help */
-               if (create && direct) {
+               if (create) {
                        if (dax_fault)
                                ASSERT(!ISUNWRITTEN(&imap));
                        else
@@ -1280,14 +1277,7 @@ __xfs_get_blocks(
             (new || ISUNWRITTEN(&imap))))
                set_buffer_new(bh_result);
 
-       if (imap.br_startblock == DELAYSTARTBLOCK) {
-               BUG_ON(direct);
-               if (create) {
-                       set_buffer_uptodate(bh_result);
-                       set_buffer_mapped(bh_result);
-                       set_buffer_delay(bh_result);
-               }
-       }
+       BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
 
        return 0;
 
@@ -1337,7 +1327,7 @@ xfs_get_blocks_dax_fault(
  * whereas if we have flags set we will always be called in task context
  * (i.e. from a workqueue).
  */
-STATIC int
+int
 xfs_end_io_direct_write(
        struct kiocb            *iocb,
        loff_t                  offset,
@@ -1408,234 +1398,10 @@ xfs_vm_direct_IO(
        struct kiocb            *iocb,
        struct iov_iter         *iter)
 {
-       struct inode            *inode = iocb->ki_filp->f_mapping->host;
-       dio_iodone_t            *endio = NULL;
-       int                     flags = 0;
-       struct block_device     *bdev;
-
-       if (iov_iter_rw(iter) == WRITE) {
-               endio = xfs_end_io_direct_write;
-               flags = DIO_ASYNC_EXTEND;
-       }
-
-       if (IS_DAX(inode)) {
-               return dax_do_io(iocb, inode, iter,
-                                xfs_get_blocks_direct, endio, 0);
-       }
-
-       bdev = xfs_find_bdev_for_inode(inode);
-       return  __blockdev_direct_IO(iocb, inode, bdev, iter,
-                       xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-/*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
- * as the page is still locked at this point.
- */
-STATIC void
-xfs_vm_kill_delalloc_range(
-       struct inode            *inode,
-       loff_t                  start,
-       loff_t                  end)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fileoff_t           start_fsb;
-       xfs_fileoff_t           end_fsb;
-       int                     error;
-
-       start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
-       end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
-       if (end_fsb <= start_fsb)
-               return;
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-                                               end_fsb - start_fsb);
-       if (error) {
-               /* something screwed, just bail */
-               if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                       xfs_alert(ip->i_mount,
-               "xfs_vm_write_failed: unable to clean up ino %lld",
-                                       ip->i_ino);
-               }
-       }
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-STATIC void
-xfs_vm_write_failed(
-       struct inode            *inode,
-       struct page             *page,
-       loff_t                  pos,
-       unsigned                len)
-{
-       loff_t                  block_offset;
-       loff_t                  block_start;
-       loff_t                  block_end;
-       loff_t                  from = pos & (PAGE_SIZE - 1);
-       loff_t                  to = from + len;
-       struct buffer_head      *bh, *head;
-       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
-
        /*
-        * The request pos offset might be 32 or 64 bit, this is all fine
-        * on 64-bit platform.  However, for 64-bit pos request on 32-bit
-        * platform, the high 32-bit will be masked off if we evaluate the
-        * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
-        * 0xfffff000 as an unsigned long, hence the result is incorrect
-        * which could cause the following ASSERT failed in most cases.
-        * In order to avoid this, we can evaluate the block_offset of the
-        * start of the page by using shifts rather than masks the mismatch
-        * problem.
+        * We just need the method present so that open/fcntl allow direct I/O.
         */
-       block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
-
-       ASSERT(block_offset + from == pos);
-
-       head = page_buffers(page);
-       block_start = 0;
-       for (bh = head; bh != head || !block_start;
-            bh = bh->b_this_page, block_start = block_end,
-                                  block_offset += bh->b_size) {
-               block_end = block_start + bh->b_size;
-
-               /* skip buffers before the write */
-               if (block_end <= from)
-                       continue;
-
-               /* if the buffer is after the write, we're done */
-               if (block_start >= to)
-                       break;
-
-               /*
-                * Process delalloc and unwritten buffers beyond EOF. We can
-                * encounter unwritten buffers in the event that a file has
-                * post-EOF unwritten extents and an extending write happens to
-                * fail (e.g., an unaligned write that also involves a delalloc
-                * to the same page).
-                */
-               if (!buffer_delay(bh) && !buffer_unwritten(bh))
-                       continue;
-
-               if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
-                   block_offset < i_size_read(inode))
-                       continue;
-
-               if (buffer_delay(bh))
-                       xfs_vm_kill_delalloc_range(inode, block_offset,
-                                                  block_offset + bh->b_size);
-
-               /*
-                * This buffer does not contain data anymore. make sure anyone
-                * who finds it knows that for certain.
-                */
-               clear_buffer_delay(bh);
-               clear_buffer_uptodate(bh);
-               clear_buffer_mapped(bh);
-               clear_buffer_new(bh);
-               clear_buffer_dirty(bh);
-               clear_buffer_unwritten(bh);
-       }
-
-}
-
-/*
- * This used to call block_write_begin(), but it unlocks and releases the page
- * on error, and we need that page to be able to punch stale delalloc blocks out
- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
- * the appropriate point.
- */
-STATIC int
-xfs_vm_write_begin(
-       struct file             *file,
-       struct address_space    *mapping,
-       loff_t                  pos,
-       unsigned                len,
-       unsigned                flags,
-       struct page             **pagep,
-       void                    **fsdata)
-{
-       pgoff_t                 index = pos >> PAGE_SHIFT;
-       struct page             *page;
-       int                     status;
-       struct xfs_mount        *mp = XFS_I(mapping->host)->i_mount;
-
-       ASSERT(len <= PAGE_SIZE);
-
-       page = grab_cache_page_write_begin(mapping, index, flags);
-       if (!page)
-               return -ENOMEM;
-
-       status = __block_write_begin(page, pos, len, xfs_get_blocks);
-       if (xfs_mp_fail_writes(mp))
-               status = -EIO;
-       if (unlikely(status)) {
-               struct inode    *inode = mapping->host;
-               size_t          isize = i_size_read(inode);
-
-               xfs_vm_write_failed(inode, page, pos, len);
-               unlock_page(page);
-
-               /*
-                * If the write is beyond EOF, we only want to kill blocks
-                * allocated in this write, not blocks that were previously
-                * written successfully.
-                */
-               if (xfs_mp_fail_writes(mp))
-                       isize = 0;
-               if (pos + len > isize) {
-                       ssize_t start = max_t(ssize_t, pos, isize);
-
-                       truncate_pagecache_range(inode, start, pos + len);
-               }
-
-               put_page(page);
-               page = NULL;
-       }
-
-       *pagep = page;
-       return status;
-}
-
-/*
- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
- * this specific write because they will never be written. Previous writes
- * beyond EOF where block allocation succeeded do not need to be trashed, so
- * only new blocks from this write should be trashed. For blocks within
- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
- * written with all the other valid data.
- */
-STATIC int
-xfs_vm_write_end(
-       struct file             *file,
-       struct address_space    *mapping,
-       loff_t                  pos,
-       unsigned                len,
-       unsigned                copied,
-       struct page             *page,
-       void                    *fsdata)
-{
-       int                     ret;
-
-       ASSERT(len <= PAGE_SIZE);
-
-       ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-       if (unlikely(ret < len)) {
-               struct inode    *inode = mapping->host;
-               size_t          isize = i_size_read(inode);
-               loff_t          to = pos + len;
-
-               if (to > isize) {
-                       /* only kill blocks in this write beyond EOF */
-                       if (pos > isize)
-                               isize = pos;
-                       xfs_vm_kill_delalloc_range(inode, isize, to);
-                       truncate_pagecache_range(inode, isize, to);
-               }
-       }
-       return ret;
+       return -EINVAL;
 }
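From user space, the practical effect of keeping a stub ->direct_IO method is whether an O_DIRECT open succeeds at all: the generic open path rejects O_DIRECT with EINVAL when the underlying mapping offers no direct_IO method. A minimal check is below; the path is an arbitrary example and the program simply reports whichever outcome it sees.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* example path; point it at a file on the filesystem of interest */
        int fd = open("/tmp/dio-test", O_CREAT | O_WRONLY | O_DIRECT, 0644);

        if (fd < 0) {
                printf("O_DIRECT open failed: %s\n", strerror(errno));
                return 1;
        }
        printf("O_DIRECT open succeeded\n");
        close(fd);
        return 0;
}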
 
 STATIC sector_t
@@ -1748,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = {
        .set_page_dirty         = xfs_vm_set_page_dirty,
        .releasepage            = xfs_vm_releasepage,
        .invalidatepage         = xfs_vm_invalidatepage,
-       .write_begin            = xfs_vm_write_begin,
-       .write_end              = xfs_vm_write_end,
        .bmap                   = xfs_vm_bmap,
        .direct_IO              = xfs_vm_direct_IO,
        .migratepage            = buffer_migrate_page,
index 814aab7..bf2d9a1 100644 (file)
@@ -60,6 +60,9 @@ int   xfs_get_blocks_direct(struct inode *inode, sector_t offset,
 int    xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
                                 struct buffer_head *map_bh, int create);
 
+int    xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
+               ssize_t size, void *private);
+
 extern void xfs_count_page_state(struct page *, int *, int *);
 extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
 
index 55d2149..be0b79d 100644 (file)
@@ -322,7 +322,7 @@ xfs_attr3_node_inactive(
  * Recurse (gasp!) through the attribute nodes until we find leaves.
  * We're doing a depth-first traversal in order to invalidate everything.
  */
-int
+static int
 xfs_attr3_root_inactive(
        struct xfs_trans        **trans,
        struct xfs_inode        *dp)
index d25f26b..25e76cd 100644 (file)
@@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b)
  * we have to calculate each entries' hashvalue and sort them before
  * we can begin returning them to the user.
  */
-int
+static int
 xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 {
        attrlist_cursor_kern_t *cursor;
index 586bb64..cd4a850 100644 (file)
@@ -79,6 +79,23 @@ xfs_zero_extent(
                GFP_NOFS, true);
 }
 
+/* Sort bmap items by AG. */
+static int
+xfs_bmap_free_list_cmp(
+       void                    *priv,
+       struct list_head        *a,
+       struct list_head        *b)
+{
+       struct xfs_mount        *mp = priv;
+       struct xfs_bmap_free_item       *ra;
+       struct xfs_bmap_free_item       *rb;
+
+       ra = container_of(a, struct xfs_bmap_free_item, xbfi_list);
+       rb = container_of(b, struct xfs_bmap_free_item, xbfi_list);
+       return  XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) -
+               XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock);
+}
+
 /*
  * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
  * caller.  Frees all the extents that need freeing, which must be done
@@ -99,14 +116,15 @@ xfs_bmap_finish(
        int                             error;  /* error return value */
        int                             committed;/* xact committed or not */
        struct xfs_bmap_free_item       *free;  /* free extent item */
-       struct xfs_bmap_free_item       *next;  /* next item on free list */
 
        ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
        if (flist->xbf_count == 0)
                return 0;
 
+       list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp);
+
        efi = xfs_trans_get_efi(*tp, flist->xbf_count);
-       for (free = flist->xbf_first; free; free = free->xbfi_next)
+       list_for_each_entry(free, &flist->xbf_flist, xbfi_list)
                xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
                        free->xbfi_blockcount);
 
@@ -125,9 +143,7 @@ xfs_bmap_finish(
                if (committed) {
                        xfs_efi_release(efi);
                        xfs_force_shutdown((*tp)->t_mountp,
-                               (error == -EFSCORRUPTED) ?
-                                       SHUTDOWN_CORRUPT_INCORE :
-                                       SHUTDOWN_META_IO_ERROR);
+                                          SHUTDOWN_META_IO_ERROR);
                }
                return error;
        }
@@ -138,15 +154,15 @@ xfs_bmap_finish(
         * on error.
         */
        efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
-       for (free = flist->xbf_first; free != NULL; free = next) {
-               next = free->xbfi_next;
-
+       while (!list_empty(&flist->xbf_flist)) {
+               free = list_first_entry(&flist->xbf_flist,
+                               struct xfs_bmap_free_item, xbfi_list);
                error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
                                              free->xbfi_blockcount);
                if (error)
                        return error;
 
-               xfs_bmap_del_free(flist, NULL, free);
+               xfs_bmap_del_free(flist, free);
        }
 
        return 0;
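The list_sort() call added above orders the pending frees by allocation group before they are logged and processed, presumably so AG buffers are always taken in ascending order. A user-space analogue using qsort() is sketched below; the agblklog shift used to derive the AG number is an assumption standing in for XFS_FSB_TO_AGNO().

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct free_item {
        uint64_t        startblock;     /* filesystem block number */
        uint32_t        blockcount;
};

static const unsigned int agblklog = 20;        /* hypothetical 2^20 blocks/AG */

/* same ordering as the comparison callback above: ascending AG number */
static int free_item_cmp(const void *a, const void *b)
{
        uint64_t agno_a = ((const struct free_item *)a)->startblock >> agblklog;
        uint64_t agno_b = ((const struct free_item *)b)->startblock >> agblklog;

        return (agno_a > agno_b) - (agno_a < agno_b);
}

int main(void)
{
        struct free_item items[] = {
                { 3ULL << 20, 8 },      /* AG 3 */
                { 0ULL << 20, 4 },      /* AG 0 */
                { 2ULL << 20, 16 },     /* AG 2 */
        };
        size_t i, n = sizeof(items) / sizeof(items[0]);

        qsort(items, n, sizeof(items[0]), free_item_cmp);
        for (i = 0; i < n; i++)
                printf("free %u blocks in AG %llu\n",
                       (unsigned)items[i].blockcount,
                       (unsigned long long)(items[i].startblock >> agblklog));
        return 0;
}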
@@ -409,7 +425,7 @@ xfs_bmap_count_tree(
 /*
  * Count fsblocks of the given fork.
  */
-int                                            /* error */
+static int                                     /* error */
 xfs_bmap_count_blocks(
        xfs_trans_t             *tp,            /* transaction pointer */
        xfs_inode_t             *ip,            /* incore inode */
@@ -799,7 +815,7 @@ xfs_bmap_punch_delalloc_range(
                if (error)
                        break;
 
-               ASSERT(!flist.xbf_count && !flist.xbf_first);
+               ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist));
 next_block:
                start_fsb++;
                remaining--;
@@ -1089,99 +1105,120 @@ error1:        /* Just cancel transaction */
        return error;
 }
 
-/*
- * Zero file bytes between startoff and endoff inclusive.
- * The iolock is held exclusive and no blocks are buffered.
- *
- * This function is used by xfs_free_file_space() to zero
- * partial blocks when the range to free is not block aligned.
- * When unreserving space with boundaries that are not block
- * aligned we round up the start and round down the end
- * boundaries and then use this function to zero the parts of
- * the blocks that got dropped during the rounding.
- */
-STATIC int
-xfs_zero_remaining_bytes(
-       xfs_inode_t             *ip,
-       xfs_off_t               startoff,
-       xfs_off_t               endoff)
+static int
+xfs_unmap_extent(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           startoffset_fsb,
+       xfs_filblks_t           len_fsb,
+       int                     *done)
 {
-       xfs_bmbt_irec_t         imap;
-       xfs_fileoff_t           offset_fsb;
-       xfs_off_t               lastoffset;
-       xfs_off_t               offset;
-       xfs_buf_t               *bp;
-       xfs_mount_t             *mp = ip->i_mount;
-       int                     nimap;
-       int                     error = 0;
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_trans        *tp;
+       struct xfs_bmap_free    free_list;
+       xfs_fsblock_t           firstfsb;
+       uint                    resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+       int                     error;
 
-       /*
-        * Avoid doing I/O beyond eof - it's not necessary
-        * since nothing can read beyond eof.  The space will
-        * be zeroed when the file is extended anyway.
-        */
-       if (startoff >= XFS_ISIZE(ip))
-               return 0;
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+       if (error) {
+               ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+               return error;
+       }
 
-       if (endoff > XFS_ISIZE(ip))
-               endoff = XFS_ISIZE(ip);
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
+                       ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
+       if (error)
+               goto out_trans_cancel;
 
-       for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
-               uint lock_mode;
+       xfs_trans_ijoin(tp, ip, 0);
 
-               offset_fsb = XFS_B_TO_FSBT(mp, offset);
-               nimap = 1;
+       xfs_bmap_init(&free_list, &firstfsb);
+       error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb,
+                       &free_list, done);
+       if (error)
+               goto out_bmap_cancel;
 
-               lock_mode = xfs_ilock_data_map_shared(ip);
-               error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
-               xfs_iunlock(ip, lock_mode);
+       error = xfs_bmap_finish(&tp, &free_list, NULL);
+       if (error)
+               goto out_bmap_cancel;
 
-               if (error || nimap < 1)
-                       break;
-               ASSERT(imap.br_blockcount >= 1);
-               ASSERT(imap.br_startoff == offset_fsb);
-               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+       error = xfs_trans_commit(tp);
+out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return error;
 
-               if (imap.br_startblock == HOLESTARTBLOCK ||
-                   imap.br_state == XFS_EXT_UNWRITTEN) {
-                       /* skip the entire extent */
-                       lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
-                                                     imap.br_blockcount) - 1;
-                       continue;
-               }
+out_bmap_cancel:
+       xfs_bmap_cancel(&free_list);
+out_trans_cancel:
+       xfs_trans_cancel(tp);
+       goto out_unlock;
+}
 
-               lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
-               if (lastoffset > endoff)
-                       lastoffset = endoff;
+static int
+xfs_adjust_extent_unmap_boundaries(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           *startoffset_fsb,
+       xfs_fileoff_t           *endoffset_fsb)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_bmbt_irec    imap;
+       int                     nimap, error;
+       xfs_extlen_t            mod = 0;
 
-               /* DAX can just zero the backing device directly */
-               if (IS_DAX(VFS_I(ip))) {
-                       error = dax_zero_page_range(VFS_I(ip), offset,
-                                                   lastoffset - offset + 1,
-                                                   xfs_get_blocks_direct);
-                       if (error)
-                               return error;
-                       continue;
-               }
+       nimap = 1;
+       error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
+       if (error)
+               return error;
 
-               error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
-                               mp->m_rtdev_targp : mp->m_ddev_targp,
-                               xfs_fsb_to_db(ip, imap.br_startblock),
-                               BTOBB(mp->m_sb.sb_blocksize),
-                               0, &bp, NULL);
-               if (error)
-                       return error;
+       if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+               xfs_daddr_t     block;
 
-               memset(bp->b_addr +
-                               (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
-                      0, lastoffset - offset + 1);
+               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+               block = imap.br_startblock;
+               mod = do_div(block, mp->m_sb.sb_rextsize);
+               if (mod)
+                       *startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+       }
 
-               error = xfs_bwrite(bp);
-               xfs_buf_relse(bp);
-               if (error)
-                       return error;
+       nimap = 1;
+       error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
+       if (error)
+               return error;
+
+       if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+               mod++;
+               if (mod && mod != mp->m_sb.sb_rextsize)
+                       *endoffset_fsb -= mod;
        }
-       return error;
+
+       return 0;
+}
+
+static int
+xfs_flush_unmap_range(
+       struct xfs_inode        *ip,
+       xfs_off_t               offset,
+       xfs_off_t               len)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct inode            *inode = VFS_I(ip);
+       xfs_off_t               rounding, start, end;
+       int                     error;
+
+       /* wait for the completion of any pending DIOs */
+       inode_dio_wait(inode);
+
+       rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
+       start = round_down(offset, rounding);
+       end = round_up(offset + len, rounding) - 1;
+
+       error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       if (error)
+               return error;
+       truncate_pagecache_range(inode, start, end);
+       return 0;
 }
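The rounding in xfs_flush_unmap_range() widens the writeback/invalidate window to the larger of the filesystem block size and the page size, so whole pages and whole blocks are always covered. The arithmetic can be checked in isolation; the block size and offsets below are arbitrary examples.

#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SIZE  4096ULL

/* equivalents of the kernel's round_down()/round_up() for power-of-two sizes */
static uint64_t round_down_pow2(uint64_t x, uint64_t r)
{
        return x & ~(r - 1);
}

static uint64_t round_up_pow2(uint64_t x, uint64_t r)
{
        return (x + r - 1) & ~(r - 1);
}

int main(void)
{
        uint64_t        blocksize = 1024;       /* hypothetical 1k blocks */
        uint64_t        offset = 5000, len = 3000;
        uint64_t        rounding, start, end;

        /* flush and invalidate whole pages and whole blocks */
        rounding = blocksize > DEMO_PAGE_SIZE ? blocksize : DEMO_PAGE_SIZE;
        start = round_down_pow2(offset, rounding);
        end = round_up_pow2(offset + len, rounding) - 1;
        printf("write back and truncate page cache over [%llu, %llu]\n",
               (unsigned long long)start, (unsigned long long)end);
        return 0;
}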
 
 int
@@ -1190,24 +1227,10 @@ xfs_free_file_space(
        xfs_off_t               offset,
        xfs_off_t               len)
 {
-       int                     done;
-       xfs_fileoff_t           endoffset_fsb;
-       int                     error;
-       xfs_fsblock_t           firstfsb;
-       xfs_bmap_free_t         free_list;
-       xfs_bmbt_irec_t         imap;
-       xfs_off_t               ioffset;
-       xfs_off_t               iendoffset;
-       xfs_extlen_t            mod=0;
-       xfs_mount_t             *mp;
-       int                     nimap;
-       uint                    resblks;
-       xfs_off_t               rounding;
-       int                     rt;
+       struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           startoffset_fsb;
-       xfs_trans_t             *tp;
-
-       mp = ip->i_mount;
+       xfs_fileoff_t           endoffset_fsb;
+       int                     done = 0, error;
 
        trace_xfs_free_file_space(ip);
 
@@ -1215,135 +1238,45 @@ xfs_free_file_space(
        if (error)
                return error;
 
-       error = 0;
        if (len <= 0)   /* if nothing being freed */
-               return error;
-       rt = XFS_IS_REALTIME_INODE(ip);
-       startoffset_fsb = XFS_B_TO_FSB(mp, offset);
-       endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
-
-       /* wait for the completion of any pending DIOs */
-       inode_dio_wait(VFS_I(ip));
+               return 0;
 
-       rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
-       ioffset = round_down(offset, rounding);
-       iendoffset = round_up(offset + len, rounding) - 1;
-       error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
-                                            iendoffset);
+       error = xfs_flush_unmap_range(ip, offset, len);
        if (error)
-               goto out;
-       truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
+               return error;
+
+       startoffset_fsb = XFS_B_TO_FSB(mp, offset);
+       endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
 
        /*
-        * Need to zero the stuff we're not freeing, on disk.
-        * If it's a realtime file & can't use unwritten extents then we
-        * actually need to zero the extent edges.  Otherwise xfs_bunmapi
-        * will take care of it for us.
+        * Need to zero the stuff we're not freeing, on disk.  If it's a
+        * realtime file and we can't use unwritten extents then we actually
+        * need to zero the whole extent, otherwise we just need to take care
+        * of the block boundaries, and xfs_bunmapi will handle the rest.
         */
-       if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
-               nimap = 1;
-               error = xfs_bmapi_read(ip, startoffset_fsb, 1,
-                                       &imap, &nimap, 0);
-               if (error)
-                       goto out;
-               ASSERT(nimap == 0 || nimap == 1);
-               if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-                       xfs_daddr_t     block;
-
-                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-                       block = imap.br_startblock;
-                       mod = do_div(block, mp->m_sb.sb_rextsize);
-                       if (mod)
-                               startoffset_fsb += mp->m_sb.sb_rextsize - mod;
-               }
-               nimap = 1;
-               error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
-                                       &imap, &nimap, 0);
+       if (XFS_IS_REALTIME_INODE(ip) &&
+           !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+               error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
+                               &endoffset_fsb);
                if (error)
-                       goto out;
-               ASSERT(nimap == 0 || nimap == 1);
-               if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-                       mod++;
-                       if (mod && (mod != mp->m_sb.sb_rextsize))
-                               endoffset_fsb -= mod;
-               }
-       }
-       if ((done = (endoffset_fsb <= startoffset_fsb)))
-               /*
-                * One contiguous piece to clear
-                */
-               error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
-       else {
-               /*
-                * Some full blocks, possibly two pieces to clear
-                */
-               if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
-                       error = xfs_zero_remaining_bytes(ip, offset,
-                               XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
-               if (!error &&
-                   XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
-                       error = xfs_zero_remaining_bytes(ip,
-                               XFS_FSB_TO_B(mp, endoffset_fsb),
-                               offset + len - 1);
+                       return error;
        }
 
-       /*
-        * free file space until done or until there is an error
-        */
-       resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
-       while (!error && !done) {
-
-               /*
-                * allocate and setup the transaction. Allow this
-                * transaction to dip into the reserve blocks to ensure
-                * the freeing of the space succeeds at ENOSPC.
-                */
-               error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
-                               &tp);
-               if (error) {
-                       ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       break;
+       if (endoffset_fsb > startoffset_fsb) {
+               while (!done) {
+                       error = xfs_unmap_extent(ip, startoffset_fsb,
+                                       endoffset_fsb - startoffset_fsb, &done);
+                       if (error)
+                               return error;
                }
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               error = xfs_trans_reserve_quota(tp, mp,
-                               ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
-                               resblks, 0, XFS_QMOPT_RES_REGBLKS);
-               if (error)
-                       goto error1;
-
-               xfs_trans_ijoin(tp, ip, 0);
-
-               /*
-                * issue the bunmapi() call to free the blocks
-                */
-               xfs_bmap_init(&free_list, &firstfsb);
-               error = xfs_bunmapi(tp, ip, startoffset_fsb,
-                                 endoffset_fsb - startoffset_fsb,
-                                 0, 2, &firstfsb, &free_list, &done);
-               if (error)
-                       goto error0;
-
-               /*
-                * complete the transaction
-                */
-               error = xfs_bmap_finish(&tp, &free_list, NULL);
-               if (error)
-                       goto error0;
-
-               error = xfs_trans_commit(tp);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }
 
- out:
-       return error;
-
- error0:
-       xfs_bmap_cancel(&free_list);
- error1:
-       xfs_trans_cancel(tp);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       goto out;
+       /*
+        * Now that we've unmapped all the full blocks we'll have to zero out any
+        * partial block at the beginning and/or end.  xfs_zero_range is
+        * smart enough to skip any holes, including those we just created.
+        */
+       return xfs_zero_range(ip, offset, len, NULL);
 }
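After the unmap loop, only the sub-block edges of the requested range can still hold stale data, which is why a single xfs_zero_range() call over the original byte range suffices. Re-doing the XFS_B_TO_FSB (round up) and XFS_B_TO_FSBT (round down) conversions in user space makes the split visible; the block size and range below are arbitrary.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t        bs = 4096;              /* hypothetical block size */
        uint64_t        offset = 5000, len = 20000;
        /* XFS_B_TO_FSB rounds up, XFS_B_TO_FSBT truncates */
        uint64_t        start_fsb = (offset + bs - 1) / bs;
        uint64_t        end_fsb = (offset + len) / bs;

        if (end_fsb > start_fsb)
                printf("unmap whole blocks [%llu, %llu)\n",
                       (unsigned long long)start_fsb,
                       (unsigned long long)end_fsb);
        /* the partial edges outside those blocks are handled by one
         * xfs_zero_range() call over the whole byte range, which skips
         * the holes just punched */
        printf("zero remaining bytes inside [%llu, %llu)\n",
               (unsigned long long)offset,
               (unsigned long long)(offset + len));
        return 0;
}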
 
 /*
index af97d9a..f200714 100644 (file)
@@ -31,8 +31,6 @@ struct xfs_bmalloca;
 int    xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
 int    xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
                     int whichfork, int *eof);
-int    xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
-                             int whichfork, int *count);
 int    xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
                xfs_fileoff_t start_fsb, xfs_fileoff_t length);
 
@@ -43,7 +41,6 @@ int   xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
 
 /* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
 void   xfs_bmap_del_free(struct xfs_bmap_free *flist,
-                         struct xfs_bmap_free_item *prev,
                          struct xfs_bmap_free_item *free);
 int    xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
                               struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
index a87a0d5..47a318c 100644 (file)
@@ -79,6 +79,47 @@ xfs_buf_vmap_len(
        return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
 }
 
+/*
+ * Bump the I/O in flight count on the buftarg if we haven't yet done so for
+ * this buffer. The count is incremented once per buffer (per hold cycle)
+ * because the corresponding decrement is deferred to buffer release. Buffers
+ * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
+ * tracking adds unnecessary overhead. This is used for synchronization purposes
+ * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
+ * in-flight buffers.
+ *
+ * Buffers that are never released (e.g., superblock, iclog buffers) must set
+ * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
+ * never reaches zero and unmount hangs indefinitely.
+ */
+static inline void
+xfs_buf_ioacct_inc(
+       struct xfs_buf  *bp)
+{
+       if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
+               return;
+
+       ASSERT(bp->b_flags & XBF_ASYNC);
+       bp->b_flags |= _XBF_IN_FLIGHT;
+       percpu_counter_inc(&bp->b_target->bt_io_count);
+}
+
+/*
+ * Clear the in-flight state on a buffer about to be released to the LRU or
+ * freed and unaccount from the buftarg.
+ */
+static inline void
+xfs_buf_ioacct_dec(
+       struct xfs_buf  *bp)
+{
+       if (!(bp->b_flags & _XBF_IN_FLIGHT))
+               return;
+
+       ASSERT(bp->b_flags & XBF_ASYNC);
+       bp->b_flags &= ~_XBF_IN_FLIGHT;
+       percpu_counter_dec(&bp->b_target->bt_io_count);
+}
+
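The accounting scheme described above can be modelled without kernel infrastructure: one shared counter standing in for the per-cpu bt_io_count, an IN_FLIGHT bit so the counter moves at most once per hold/release cycle, and an opt-out flag for buffers that are never released. The sketch below uses a plain C11 atomic and is only an analogue, not the XFS implementation.

#include <stdatomic.h>
#include <stdio.h>

#define BUF_NO_IOACCT   (1u << 0)       /* buffer never released, don't count */
#define BUF_IN_FLIGHT   (1u << 1)       /* already counted for this hold cycle */

struct buf {
        unsigned int    flags;
};

static atomic_uint      io_count;       /* stands in for bt_io_count */

static void buf_ioacct_inc(struct buf *bp)
{
        if (bp->flags & (BUF_NO_IOACCT | BUF_IN_FLIGHT))
                return;                 /* exempt, or already counted */
        bp->flags |= BUF_IN_FLIGHT;
        atomic_fetch_add(&io_count, 1);
}

static void buf_ioacct_dec(struct buf *bp)
{
        if (!(bp->flags & BUF_IN_FLIGHT))
                return;
        bp->flags &= ~BUF_IN_FLIGHT;
        atomic_fetch_sub(&io_count, 1);
}

int main(void)
{
        struct buf      a = { 0 };
        struct buf      sb = { BUF_NO_IOACCT };

        buf_ioacct_inc(&a);     /* first submission in this hold cycle counts */
        buf_ioacct_inc(&a);     /* resubmission does not count again */
        buf_ioacct_inc(&sb);    /* never-released buffers stay exempt */
        printf("in flight after submit: %u\n", atomic_load(&io_count));

        buf_ioacct_dec(&a);     /* release drops the accounting */
        printf("in flight after release: %u\n", atomic_load(&io_count));

        /* unmount-style wait: poll until the count drains to zero */
        while (atomic_load(&io_count) != 0)
                ;
        return 0;
}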
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -102,6 +143,14 @@ xfs_buf_stale(
         */
        bp->b_flags &= ~_XBF_DELWRI_Q;
 
+       /*
+        * Once the buffer is marked stale and unlocked, a subsequent lookup
+        * could reset b_flags. There is no guarantee that the buffer is
+        * unaccounted (released to LRU) before that occurs. Drop in-flight
+        * status now to preserve accounting consistency.
+        */
+       xfs_buf_ioacct_dec(bp);
+
        spin_lock(&bp->b_lock);
        atomic_set(&bp->b_lru_ref, 0);
        if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
@@ -815,7 +864,8 @@ xfs_buf_get_uncached(
        struct xfs_buf          *bp;
        DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
 
-       bp = _xfs_buf_alloc(target, &map, 1, 0);
+       /* flags might contain irrelevant bits, pass only what we care about */
+       bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
        if (unlikely(bp == NULL))
                goto fail;
 
@@ -866,63 +916,85 @@ xfs_buf_hold(
 }
 
 /*
- *     Releases a hold on the specified buffer.  If the
- *     the hold count is 1, calls xfs_buf_free.
+ * Release a hold on the specified buffer. If the hold count is 1, the buffer is
+ * placed on the LRU or freed (depending on b_lru_ref).
  */
 void
 xfs_buf_rele(
        xfs_buf_t               *bp)
 {
        struct xfs_perag        *pag = bp->b_pag;
+       bool                    release;
+       bool                    freebuf = false;
 
        trace_xfs_buf_rele(bp, _RET_IP_);
 
        if (!pag) {
                ASSERT(list_empty(&bp->b_lru));
                ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
-               if (atomic_dec_and_test(&bp->b_hold))
+               if (atomic_dec_and_test(&bp->b_hold)) {
+                       xfs_buf_ioacct_dec(bp);
                        xfs_buf_free(bp);
+               }
                return;
        }
 
        ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
 
        ASSERT(atomic_read(&bp->b_hold) > 0);
-       if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-               spin_lock(&bp->b_lock);
-               if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
-                       /*
-                        * If the buffer is added to the LRU take a new
-                        * reference to the buffer for the LRU and clear the
-                        * (now stale) dispose list state flag
-                        */
-                       if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
-                               bp->b_state &= ~XFS_BSTATE_DISPOSE;
-                               atomic_inc(&bp->b_hold);
-                       }
-                       spin_unlock(&bp->b_lock);
-                       spin_unlock(&pag->pag_buf_lock);
-               } else {
-                       /*
-                        * most of the time buffers will already be removed from
-                        * the LRU, so optimise that case by checking for the
-                        * XFS_BSTATE_DISPOSE flag indicating the last list the
-                        * buffer was on was the disposal list
-                        */
-                       if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
-                               list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
-                       } else {
-                               ASSERT(list_empty(&bp->b_lru));
-                       }
-                       spin_unlock(&bp->b_lock);
 
-                       ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
-                       rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
-                       spin_unlock(&pag->pag_buf_lock);
-                       xfs_perag_put(pag);
-                       xfs_buf_free(bp);
+       release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+       spin_lock(&bp->b_lock);
+       if (!release) {
+               /*
+                * Drop the in-flight state if the buffer is already on the LRU
+                * and it holds the only reference. This is racy because we
+                * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
+                * ensures the decrement occurs only once per-buf.
+                */
+               if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+                       xfs_buf_ioacct_dec(bp);
+               goto out_unlock;
+       }
+
+       /* the last reference has been dropped ... */
+       xfs_buf_ioacct_dec(bp);
+       if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+               /*
+                * If the buffer is added to the LRU take a new reference to the
+                * buffer for the LRU and clear the (now stale) dispose list
+                * state flag
+                */
+               if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+                       bp->b_state &= ~XFS_BSTATE_DISPOSE;
+                       atomic_inc(&bp->b_hold);
+               }
+               spin_unlock(&pag->pag_buf_lock);
+       } else {
+               /*
+                * most of the time buffers will already be removed from the
+                * LRU, so optimise that case by checking for the
+                * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
+                * was on was the disposal list
+                */
+               if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+                       list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+               } else {
+                       ASSERT(list_empty(&bp->b_lru));
                }
+
+               ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+               rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+               spin_unlock(&pag->pag_buf_lock);
+               xfs_perag_put(pag);
+               freebuf = true;
        }
+
+out_unlock:
+       spin_unlock(&bp->b_lock);
+
+       if (freebuf)
+               xfs_buf_free(bp);
 }
 
 
@@ -944,10 +1016,12 @@ xfs_buf_trylock(
        int                     locked;
 
        locked = down_trylock(&bp->b_sema) == 0;
-       if (locked)
+       if (locked) {
                XB_SET_OWNER(bp);
-
-       trace_xfs_buf_trylock(bp, _RET_IP_);
+               trace_xfs_buf_trylock(bp, _RET_IP_);
+       } else {
+               trace_xfs_buf_trylock_fail(bp, _RET_IP_);
+       }
        return locked;
 }
 
@@ -1339,6 +1413,7 @@ xfs_buf_submit(
         * xfs_buf_ioend too early.
         */
        atomic_set(&bp->b_io_remaining, 1);
+       xfs_buf_ioacct_inc(bp);
        _xfs_buf_ioapply(bp);
 
        /*
@@ -1524,13 +1599,19 @@ xfs_wait_buftarg(
        int loop = 0;
 
        /*
-        * We need to flush the buffer workqueue to ensure that all IO
-        * completion processing is 100% done. Just waiting on buffer locks is
-        * not sufficient for async IO as the reference count held over IO is
-        * not released until after the buffer lock is dropped. Hence we need to
-        * ensure here that all reference counts have been dropped before we
-        * start walking the LRU list.
+        * First wait on the buftarg I/O count for all in-flight buffers to be
+        * released. This is critical as new buffers do not make the LRU until
+        * they are released.
+        *
+        * Next, flush the buffer workqueue to ensure all completion processing
+        * has finished. Just waiting on buffer locks is not sufficient for
+        * async IO as the reference count held over IO is not released until
+        * after the buffer lock is dropped. Hence we need to ensure here that
+        * all reference counts have been dropped before we start walking the
+        * LRU list.
         */
+       while (percpu_counter_sum(&btp->bt_io_count))
+               delay(100);
        drain_workqueue(btp->bt_mount->m_buf_workqueue);
 
        /* loop until there is nothing left on the lru list. */
@@ -1627,6 +1708,8 @@ xfs_free_buftarg(
        struct xfs_buftarg      *btp)
 {
        unregister_shrinker(&btp->bt_shrinker);
+       ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+       percpu_counter_destroy(&btp->bt_io_count);
        list_lru_destroy(&btp->bt_lru);
 
        if (mp->m_flags & XFS_MOUNT_BARRIER)
@@ -1691,6 +1774,9 @@ xfs_alloc_buftarg(
        if (list_lru_init(&btp->bt_lru))
                goto error;
 
+       if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+               goto error;
+
        btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
        btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
        btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@ -1774,18 +1860,33 @@ xfs_buf_cmp(
        return 0;
 }
 
+/*
+ * submit buffers for write.
+ *
+ * When we have a large buffer list, we do not want to hold all the buffers
+ * locked while we block on the request queue waiting for IO dispatch. To avoid
+ * this problem, we lock and submit buffers in groups of 50, thereby minimising
+ * the lock hold times for lists which may contain thousands of objects.
+ *
+ * To do this, we sort the buffer list before we walk the list to lock and
+ * submit buffers, and we plug and unplug around each group of buffers we
+ * submit.
+ */
 static int
-__xfs_buf_delwri_submit(
+xfs_buf_delwri_submit_buffers(
        struct list_head        *buffer_list,
-       struct list_head        *io_list,
-       bool                    wait)
+       struct list_head        *wait_list)
 {
-       struct blk_plug         plug;
        struct xfs_buf          *bp, *n;
+       LIST_HEAD               (submit_list);
        int                     pinned = 0;
+       struct blk_plug         plug;
 
+       list_sort(NULL, buffer_list, xfs_buf_cmp);
+
+       blk_start_plug(&plug);
        list_for_each_entry_safe(bp, n, buffer_list, b_list) {
-               if (!wait) {
+               if (!wait_list) {
                        if (xfs_buf_ispinned(bp)) {
                                pinned++;
                                continue;
@@ -1808,25 +1909,21 @@ __xfs_buf_delwri_submit(
                        continue;
                }
 
-               list_move_tail(&bp->b_list, io_list);
                trace_xfs_buf_delwri_split(bp, _RET_IP_);
-       }
-
-       list_sort(NULL, io_list, xfs_buf_cmp);
-
-       blk_start_plug(&plug);
-       list_for_each_entry_safe(bp, n, io_list, b_list) {
-               bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
-               bp->b_flags |= XBF_WRITE | XBF_ASYNC;
 
                /*
-                * we do all Io submission async. This means if we need to wait
-                * for IO completion we need to take an extra reference so the
-                * buffer is still valid on the other side.
+                * We do all IO submission async. This means if we need
+                * to wait for IO completion we need to take an extra
+                * reference so the buffer is still valid on the other
+                * side. We need to move the buffer onto the wait_list

+                * at this point so the caller can still access it.
                 */
-               if (wait)
+               bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
+               bp->b_flags |= XBF_WRITE | XBF_ASYNC;
+               if (wait_list) {
                        xfs_buf_hold(bp);
-               else
+                       list_move_tail(&bp->b_list, wait_list);
+               } else
                        list_del_init(&bp->b_list);
 
                xfs_buf_submit(bp);
@@ -1849,8 +1946,7 @@ int
 xfs_buf_delwri_submit_nowait(
        struct list_head        *buffer_list)
 {
-       LIST_HEAD               (io_list);
-       return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
+       return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
 }
 
 /*
@@ -1865,15 +1961,15 @@ int
 xfs_buf_delwri_submit(
        struct list_head        *buffer_list)
 {
-       LIST_HEAD               (io_list);
+       LIST_HEAD               (wait_list);
        int                     error = 0, error2;
        struct xfs_buf          *bp;
 
-       __xfs_buf_delwri_submit(buffer_list, &io_list, true);
+       xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
 
        /* Wait for IO to complete. */
-       while (!list_empty(&io_list)) {
-               bp = list_first_entry(&io_list, struct xfs_buf, b_list);
+       while (!list_empty(&wait_list)) {
+               bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
                list_del_init(&bp->b_list);
 
index 8bfb974..1c2e52b 100644 (file)
@@ -43,6 +43,7 @@ typedef enum {
 #define XBF_READ        (1 << 0) /* buffer intended for reading from device */
 #define XBF_WRITE       (1 << 1) /* buffer intended for writing to device */
 #define XBF_READ_AHEAD  (1 << 2) /* asynchronous read-ahead */
+#define XBF_NO_IOACCT   (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
 #define XBF_ASYNC       (1 << 4) /* initiator will not wait for completion */
 #define XBF_DONE        (1 << 5) /* all pages in the buffer uptodate */
 #define XBF_STALE       (1 << 6) /* buffer has been staled, do not find it */
@@ -62,6 +63,7 @@ typedef enum {
 #define _XBF_KMEM       (1 << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q   (1 << 22)/* buffer on a delwri queue */
 #define _XBF_COMPOUND   (1 << 23)/* compound buffer */
+#define _XBF_IN_FLIGHT  (1 << 25) /* I/O in flight, for accounting purposes */
 
 typedef unsigned int xfs_buf_flags_t;
 
@@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t;
        { _XBF_PAGES,           "PAGES" }, \
        { _XBF_KMEM,            "KMEM" }, \
        { _XBF_DELWRI_Q,        "DELWRI_Q" }, \
-       { _XBF_COMPOUND,        "COMPOUND" }
+       { _XBF_COMPOUND,        "COMPOUND" }, \
+       { _XBF_IN_FLIGHT,       "IN_FLIGHT" }
 
 
 /*
@@ -115,6 +118,8 @@ typedef struct xfs_buftarg {
        /* LRU control structures */
        struct shrinker         bt_shrinker;
        struct list_lru         bt_lru;
+
+       struct percpu_counter   bt_io_count;
 } xfs_buftarg_t;
 
 struct xfs_buf;
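The new bt_io_count counter pairs with the _XBF_IN_FLIGHT state bit so a buffer is only counted once however many times the submission or completion paths run, and XBF_NO_IOACCT exempts buffers that never go on the LRU. A small stand-alone sketch of that pattern, with hypothetical names in place of the kernel's accounting helpers (which are not visible in this hunk):

#include <stdbool.h>
#include <stdio.h>

#define BUF_NO_IOACCT   (1u << 0)       /* never counted (non-LRU buffer) */
#define BUF_IN_FLIGHT   (1u << 1)       /* currently counted in io_count */

static long io_count;

struct buf {
        unsigned int    flags;
};

/* Count the buffer as in flight exactly once. */
static void ioacct_inc(struct buf *bp)
{
        if (bp->flags & (BUF_NO_IOACCT | BUF_IN_FLIGHT))
                return;
        bp->flags |= BUF_IN_FLIGHT;
        io_count++;
}

/* Undo the accounting exactly once, only while the in-flight bit is set. */
static void ioacct_dec(struct buf *bp)
{
        if (!(bp->flags & BUF_IN_FLIGHT))
                return;
        bp->flags &= ~BUF_IN_FLIGHT;
        io_count--;
}

int main(void)
{
        struct buf bp = { 0 };

        ioacct_inc(&bp);
        ioacct_inc(&bp);        /* a second attempt does not double count */
        ioacct_dec(&bp);
        printf("io_count = %ld\n", io_count);   /* back to 0 */
        return 0;
}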
index 3425799..e455f90 100644 (file)
@@ -359,7 +359,7 @@ xfs_buf_item_format(
        for (i = 0; i < bip->bli_format_count; i++) {
                xfs_buf_item_format_segment(bip, lv, &vecp, offset,
                                            &bip->bli_formats[i]);
-               offset += bp->b_maps[i].bm_len;
+               offset += BBTOB(bp->b_maps[i].bm_len);
        }
 
        /*
@@ -915,20 +915,28 @@ xfs_buf_item_log(
        for (i = 0; i < bip->bli_format_count; i++) {
                if (start > last)
                        break;
-               end = start + BBTOB(bp->b_maps[i].bm_len);
+               end = start + BBTOB(bp->b_maps[i].bm_len) - 1;
+
+               /* skip to the map that includes the first byte to log */
                if (first > end) {
                        start += BBTOB(bp->b_maps[i].bm_len);
                        continue;
                }
+
+               /*
+                * Trim the range to this segment and mark it in the bitmap.
+                * Note that we must convert buffer offsets to segment relative
+                * offsets (e.g., the first byte of each segment is byte 0 of
+                * that segment).
+                */
                if (first < start)
                        first = start;
                if (end > last)
                        end = last;
-
-               xfs_buf_item_log_segment(first, end,
+               xfs_buf_item_log_segment(first - start, end - start,
                                         &bip->bli_formats[i].blf_data_map[0]);
 
-               start += bp->b_maps[i].bm_len;
+               start += BBTOB(bp->b_maps[i].bm_len);
        }
 }
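The xfs_buf_item_log() fix above converts buffer-relative byte offsets into segment-relative ones before marking each format segment's dirty bitmap. Below is a stand-alone sketch of just that arithmetic, using 512-byte basic blocks as in the kernel; the segment sizes in the example are arbitrary.

#include <stdint.h>
#include <stdio.h>

#define BBSHIFT 9                       /* 512-byte basic blocks */
#define BBTOB(bbs) ((uint32_t)(bbs) << BBSHIFT)

/*
 * For a buffer made of several discontiguous segments, print the part of
 * [first, last] (buffer-relative byte offsets) that each segment should
 * log, converted to that segment's own (segment-relative) offsets.
 */
static void log_range(const uint32_t *seg_len_bbs, int nsegs,
                      uint32_t first, uint32_t last)
{
        uint32_t start = 0;

        for (int i = 0; i < nsegs; i++) {
                uint32_t end = start + BBTOB(seg_len_bbs[i]) - 1;

                if (first > end) {              /* range starts past this segment */
                        start += BBTOB(seg_len_bbs[i]);
                        continue;
                }
                if (start > last)               /* no more overlap */
                        break;

                uint32_t lo = first < start ? start : first;
                uint32_t hi = end > last ? last : end;

                /* offsets are relative to this segment's own bitmap */
                printf("segment %d: log bytes %u..%u\n", i,
                       (unsigned)(lo - start), (unsigned)(hi - start));
                start += BBTOB(seg_len_bbs[i]);
        }
}

int main(void)
{
        uint32_t segs[] = { 8, 8 };     /* two 4k segments, in basic blocks */

        log_range(segs, 2, 1000, 5000); /* spans the segment boundary at 4096 */
        return 0;
}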
 
@@ -949,6 +957,7 @@ xfs_buf_item_free(
        xfs_buf_log_item_t      *bip)
 {
        xfs_buf_item_free_format(bip);
+       kmem_free(bip->bli_item.li_lv_shadow);
        kmem_zone_free(xfs_buf_item_zone, bip);
 }
 
@@ -1073,6 +1082,8 @@ xfs_buf_iodone_callback_error(
        trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
        ASSERT(bp->b_iodone != NULL);
 
+       cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+
        /*
         * If the write was asynchronous then no one will be looking for the
         * error.  If this is the first failure of this type, clear the error
@@ -1080,13 +1091,12 @@ xfs_buf_iodone_callback_error(
         * async write failure at least once, but we also need to set the buffer
         * up to behave correctly now for repeated failures.
         */
-       if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) ||
+       if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
             bp->b_last_error != bp->b_error) {
-               bp->b_flags |= (XBF_WRITE | XBF_ASYNC |
-                               XBF_DONE | XBF_WRITE_FAIL);
+               bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
                bp->b_last_error = bp->b_error;
-               bp->b_retries = 0;
-               bp->b_first_retry_time = jiffies;
+               if (cfg->retry_timeout && !bp->b_first_retry_time)
+                       bp->b_first_retry_time = jiffies;
 
                xfs_buf_ioerror(bp, 0);
                xfs_buf_submit(bp);
@@ -1097,7 +1107,6 @@ xfs_buf_iodone_callback_error(
         * Repeated failure on an async write. Take action according to the
         * error configuration we have been set up to use.
         */
-       cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
 
        if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
            ++bp->b_retries > cfg->max_retries)
index e064665..ccb0811 100644 (file)
@@ -74,6 +74,7 @@ xfs_qm_dqdestroy(
 {
        ASSERT(list_empty(&dqp->q_lru));
 
+       kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
        mutex_destroy(&dqp->q_qlock);
 
        XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
index 814cff9..2c7a162 100644 (file)
@@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed(
        spin_lock(&ailp->xa_lock);
        xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
 
+       kmem_free(qfs->qql_item.li_lv_shadow);
+       kmem_free(lip->li_lv_shadow);
        kmem_free(qfs);
        kmem_free(qfe);
        return (xfs_lsn_t)-1;
index 88693a9..ed7ee4e 100644 (file)
@@ -55,12 +55,15 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
 }
 
 int
-xfs_errortag_add(int error_tag, xfs_mount_t *mp)
+xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp)
 {
        int i;
        int len;
        int64_t fsid;
 
+       if (error_tag >= XFS_ERRTAG_MAX)
+               return -EINVAL;
+
        memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
 
        for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
index 4ed3042..2e4f67f 100644 (file)
@@ -128,7 +128,7 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
         xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
                        (rf))))
 
-extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
+extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp);
 extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
 #else
 #define XFS_TEST_ERROR(expr, mp, tag, rf)      (expr)
index 4aa0153..ab77946 100644 (file)
@@ -40,6 +40,7 @@ void
 xfs_efi_item_free(
        struct xfs_efi_log_item *efip)
 {
+       kmem_free(efip->efi_item.li_lv_shadow);
        if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
                kmem_free(efip);
        else
@@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
 STATIC void
 xfs_efd_item_free(struct xfs_efd_log_item *efdp)
 {
+       kmem_free(efdp->efd_item.li_lv_shadow);
        if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
                kmem_free(efdp);
        else
index 1b3dc9d..ed95e5b 100644 (file)
@@ -37,6 +37,7 @@
 #include "xfs_log.h"
 #include "xfs_icache.h"
 #include "xfs_pnfs.h"
+#include "xfs_iomap.h"
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
@@ -80,61 +81,17 @@ xfs_rw_ilock_demote(
 }
 
 /*
- * xfs_iozero clears the specified range supplied via the page cache (except in
- * the DAX case). Writes through the page cache will allocate blocks over holes,
- * though the callers usually map the holes first and avoid them. If a block is
- * not completely zeroed, then it will be read from disk before being partially
- * zeroed.
- *
- * In the DAX case, we can just directly write to the underlying pages. This
- * will not allocate blocks, but will avoid holes and unwritten extents and so
- * not do unnecessary work.
+ * Clear the specified ranges to zero through either the pagecache or DAX.
+ * Holes and unwritten extents will be left as-is, as they are already zeroed.
  */
 int
-xfs_iozero(
-       struct xfs_inode        *ip,    /* inode                        */
-       loff_t                  pos,    /* offset in file               */
-       size_t                  count)  /* size of data to zero         */
+xfs_zero_range(
+       struct xfs_inode        *ip,
+       xfs_off_t               pos,
+       xfs_off_t               count,
+       bool                    *did_zero)
 {
-       struct page             *page;
-       struct address_space    *mapping;
-       int                     status = 0;
-
-
-       mapping = VFS_I(ip)->i_mapping;
-       do {
-               unsigned offset, bytes;
-               void *fsdata;
-
-               offset = (pos & (PAGE_SIZE -1)); /* Within page */
-               bytes = PAGE_SIZE - offset;
-               if (bytes > count)
-                       bytes = count;
-
-               if (IS_DAX(VFS_I(ip))) {
-                       status = dax_zero_page_range(VFS_I(ip), pos, bytes,
-                                                    xfs_get_blocks_direct);
-                       if (status)
-                               break;
-               } else {
-                       status = pagecache_write_begin(NULL, mapping, pos, bytes,
-                                               AOP_FLAG_UNINTERRUPTIBLE,
-                                               &page, &fsdata);
-                       if (status)
-                               break;
-
-                       zero_user(page, offset, bytes);
-
-                       status = pagecache_write_end(NULL, mapping, pos, bytes,
-                                               bytes, page, fsdata);
-                       WARN_ON(status <= 0); /* can't return less than zero! */
-                       status = 0;
-               }
-               pos += bytes;
-               count -= bytes;
-       } while (count);
-
-       return status;
+       return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
 }
 
 int
@@ -282,48 +239,35 @@ xfs_file_fsync(
 }
 
 STATIC ssize_t
-xfs_file_read_iter(
+xfs_file_dio_aio_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
 {
-       struct file             *file = iocb->ki_filp;
-       struct inode            *inode = file->f_mapping->host;
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       size_t                  size = iov_iter_count(to);
+       loff_t                  isize = i_size_read(inode);
+       size_t                  count = iov_iter_count(to);
+       struct iov_iter         data;
+       struct xfs_buftarg      *target;
        ssize_t                 ret = 0;
-       int                     ioflags = 0;
-       xfs_fsize_t             n;
-       loff_t                  pos = iocb->ki_pos;
 
-       XFS_STATS_INC(mp, xs_read_calls);
-
-       if (unlikely(iocb->ki_flags & IOCB_DIRECT))
-               ioflags |= XFS_IO_ISDIRECT;
-       if (file->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
-
-       if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
-               xfs_buftarg_t   *target =
-                       XFS_IS_REALTIME_INODE(ip) ?
-                               mp->m_rtdev_targp : mp->m_ddev_targp;
-               /* DIO must be aligned to device logical sector size */
-               if ((pos | size) & target->bt_logical_sectormask) {
-                       if (pos == i_size_read(inode))
-                               return 0;
-                       return -EINVAL;
-               }
-       }
+       trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
 
-       n = mp->m_super->s_maxbytes - pos;
-       if (n <= 0 || size == 0)
-               return 0;
+       if (!count)
+               return 0; /* skip atime */
 
-       if (n < size)
-               size = n;
+       if (XFS_IS_REALTIME_INODE(ip))
+               target = ip->i_mount->m_rtdev_targp;
+       else
+               target = ip->i_mount->m_ddev_targp;
 
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -EIO;
+       /* DIO must be aligned to device logical sector size */
+       if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+               if (iocb->ki_pos == isize)
+                       return 0;
+               return -EINVAL;
+       }
 
        /*
         * Locking is a bit tricky here. If we take an exclusive lock for direct
@@ -336,7 +280,7 @@ xfs_file_read_iter(
         * serialisation.
         */
        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-       if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+       if (mapping->nrpages) {
                xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
                xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
@@ -351,8 +295,8 @@ xfs_file_read_iter(
                 * flush and reduce the chances of repeated iolock cycles going
                 * forward.
                 */
-               if (inode->i_mapping->nrpages) {
-                       ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+               if (mapping->nrpages) {
+                       ret = filemap_write_and_wait(mapping);
                        if (ret) {
                                xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
                                return ret;
@@ -363,20 +307,95 @@ xfs_file_read_iter(
                         * we fail to invalidate a page, but this should never
                         * happen on XFS. Warn if it does fail.
                         */
-                       ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+                       ret = invalidate_inode_pages2(mapping);
                        WARN_ON_ONCE(ret);
                        ret = 0;
                }
                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
        }
 
-       trace_xfs_file_read(ip, size, pos, ioflags);
+       data = *to;
+       ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+                       xfs_get_blocks_direct, NULL, NULL, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(to, ret);
+       }
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
+       file_accessed(iocb->ki_filp);
+       return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_read(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+{
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct iov_iter         data = *to;
+       size_t                  count = iov_iter_count(to);
+       ssize_t                 ret = 0;
+
+       trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+       if (!count)
+               return 0; /* skip atime */
+
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(to, ret);
+       }
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       file_accessed(iocb->ki_filp);
+       return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_aio_read(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+{
+       struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
+       ssize_t                 ret;
+
+       trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
        ret = generic_file_read_iter(iocb, to);
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       return ret;
+}
+
+STATIC ssize_t
+xfs_file_read_iter(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+{
+       struct inode            *inode = file_inode(iocb->ki_filp);
+       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
+       ssize_t                 ret = 0;
+
+       XFS_STATS_INC(mp, xs_read_calls);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       if (IS_DAX(inode))
+               ret = xfs_file_dax_read(iocb, to);
+       else if (iocb->ki_flags & IOCB_DIRECT)
+               ret = xfs_file_dio_aio_read(iocb, to);
+       else
+               ret = xfs_file_buffered_aio_read(iocb, to);
+
        if (ret > 0)
                XFS_STATS_ADD(mp, xs_read_bytes, ret);
-
-       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
 }
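The new direct I/O read helper above rejects requests with "(iocb->ki_pos | count) & target->bt_logical_sectormask". Because the sector size is a power of two, OR-ing the position and the length together and masking with (sector size - 1) tests both values for alignment in one step. A tiny stand-alone illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * A position/length pair is sector aligned iff both values are multiples
 * of the sector size. Any low bit set in either value survives the OR and
 * is caught by the mask.
 */
static bool dio_aligned(uint64_t pos, uint64_t count, uint32_t sector_size)
{
        uint64_t mask = sector_size - 1;        /* sector_size is a power of two */

        return ((pos | count) & mask) == 0;
}

int main(void)
{
        printf("%d\n", dio_aligned(4096, 512, 512));    /* 1: both aligned */
        printf("%d\n", dio_aligned(4096, 100, 512));    /* 0: length unaligned */
        printf("%d\n", dio_aligned(4100, 512, 512));    /* 0: offset unaligned */
        return 0;
}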
 
@@ -389,18 +408,14 @@ xfs_file_splice_read(
        unsigned int            flags)
 {
        struct xfs_inode        *ip = XFS_I(infilp->f_mapping->host);
-       int                     ioflags = 0;
        ssize_t                 ret;
 
        XFS_STATS_INC(ip->i_mount, xs_read_calls);
 
-       if (infilp->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
-
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
 
-       trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+       trace_xfs_file_splice_read(ip, count, *ppos);
 
        /*
         * DAX inodes cannot use the page cache for splice, so we have to push
@@ -423,49 +438,6 @@ out:
        return ret;
 }
 
-/*
- * This routine is called to handle zeroing any space in the last block of the
- * file that is beyond the EOF.  We do this since the size is being increased
- * without writing anything to that block and we don't want to read the
- * garbage on the disk.
- */
-STATIC int                             /* error (positive) */
-xfs_zero_last_block(
-       struct xfs_inode        *ip,
-       xfs_fsize_t             offset,
-       xfs_fsize_t             isize,
-       bool                    *did_zeroing)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           last_fsb = XFS_B_TO_FSBT(mp, isize);
-       int                     zero_offset = XFS_B_FSB_OFFSET(mp, isize);
-       int                     zero_len;
-       int                     nimaps = 1;
-       int                     error = 0;
-       struct xfs_bmbt_irec    imap;
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       if (error)
-               return error;
-
-       ASSERT(nimaps > 0);
-
-       /*
-        * If the block underlying isize is just a hole, then there
-        * is nothing to zero.
-        */
-       if (imap.br_startblock == HOLESTARTBLOCK)
-               return 0;
-
-       zero_len = mp->m_sb.sb_blocksize - zero_offset;
-       if (isize + zero_len > offset)
-               zero_len = offset - isize;
-       *did_zeroing = true;
-       return xfs_iozero(ip, isize, zero_len);
-}
-
 /*
  * Zero any on disk space between the current EOF and the new, larger EOF.
  *
@@ -484,94 +456,11 @@ xfs_zero_eof(
        xfs_fsize_t             isize,          /* current inode size */
        bool                    *did_zeroing)
 {
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           start_zero_fsb;
-       xfs_fileoff_t           end_zero_fsb;
-       xfs_fileoff_t           zero_count_fsb;
-       xfs_fileoff_t           last_fsb;
-       xfs_fileoff_t           zero_off;
-       xfs_fsize_t             zero_len;
-       int                     nimaps;
-       int                     error = 0;
-       struct xfs_bmbt_irec    imap;
-
        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
        ASSERT(offset > isize);
 
        trace_xfs_zero_eof(ip, isize, offset - isize);
-
-       /*
-        * First handle zeroing the block on which isize resides.
-        *
-        * We only zero a part of that block so it is handled specially.
-        */
-       if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
-               error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
-               if (error)
-                       return error;
-       }
-
-       /*
-        * Calculate the range between the new size and the old where blocks
-        * needing to be zeroed may exist.
-        *
-        * To get the block where the last byte in the file currently resides,
-        * we need to subtract one from the size and truncate back to a block
-        * boundary.  We subtract 1 in case the size is exactly on a block
-        * boundary.
-        */
-       last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
-       start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
-       end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
-       ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
-       if (last_fsb == end_zero_fsb) {
-               /*
-                * The size was only incremented on its last block.
-                * We took care of that above, so just return.
-                */
-               return 0;
-       }
-
-       ASSERT(start_zero_fsb <= end_zero_fsb);
-       while (start_zero_fsb <= end_zero_fsb) {
-               nimaps = 1;
-               zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
-                                         &imap, &nimaps, 0);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               if (error)
-                       return error;
-
-               ASSERT(nimaps > 0);
-
-               if (imap.br_state == XFS_EXT_UNWRITTEN ||
-                   imap.br_startblock == HOLESTARTBLOCK) {
-                       start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-                       ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-                       continue;
-               }
-
-               /*
-                * There are blocks we need to zero.
-                */
-               zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
-               zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-
-               if ((zero_off + zero_len) > offset)
-                       zero_len = offset - zero_off;
-
-               error = xfs_iozero(ip, zero_off, zero_len);
-               if (error)
-                       return error;
-
-               *did_zeroing = true;
-               start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-               ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-       }
-
-       return 0;
+       return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
 }
 
 /*
@@ -722,8 +611,7 @@ xfs_file_dio_aio_write(
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
 
        /* DIO must be aligned to device logical sector size */
-       if (!IS_DAX(inode) &&
-           ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+       if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
                return -EINVAL;
 
        /* "unaligned" here means not aligned to a filesystem block */
@@ -762,7 +650,7 @@ xfs_file_dio_aio_write(
        end = iocb->ki_pos + count - 1;
 
        /*
-        * See xfs_file_read_iter() for why we do a full-file flush here.
+        * See xfs_file_dio_aio_read() for why we do a full-file flush here.
         */
        if (mapping->nrpages) {
                ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@@ -789,10 +677,12 @@ xfs_file_dio_aio_write(
                iolock = XFS_IOLOCK_SHARED;
        }
 
-       trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+       trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 
        data = *from;
-       ret = mapping->a_ops->direct_IO(iocb, &data);
+       ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+                       xfs_get_blocks_direct, xfs_end_io_direct_write,
+                       NULL, DIO_ASYNC_EXTEND);
 
        /* see generic_file_direct_write() for why this is necessary */
        if (mapping->nrpages) {
@@ -809,10 +699,70 @@ out:
        xfs_rw_iunlock(ip, iolock);
 
        /*
-        * No fallback to buffered IO on errors for XFS. DAX can result in
-        * partial writes, but direct IO will either complete fully or fail.
+        * No fallback to buffered IO on errors for XFS; direct IO will either
+        * complete fully or fail.
         */
-       ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+       ASSERT(ret < 0 || ret == count);
+       return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_write(
+       struct kiocb            *iocb,
+       struct iov_iter         *from)
+{
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       ssize_t                 ret = 0;
+       int                     unaligned_io = 0;
+       int                     iolock;
+       struct iov_iter         data;
+
+       /* "unaligned" here means not aligned to a filesystem block */
+       if ((iocb->ki_pos & mp->m_blockmask) ||
+           ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+               unaligned_io = 1;
+               iolock = XFS_IOLOCK_EXCL;
+       } else if (mapping->nrpages) {
+               iolock = XFS_IOLOCK_EXCL;
+       } else {
+               iolock = XFS_IOLOCK_SHARED;
+       }
+       xfs_rw_ilock(ip, iolock);
+
+       ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+       if (ret)
+               goto out;
+
+       /*
+        * Yes, even DAX files can have page cache attached to them:  A zeroed
+        * page is inserted into the pagecache when we have to serve a write
+        * fault on a hole.  It should never be dirtied and can simply be
+        * dropped from the pagecache once we get real data for the page.
+        */
+       if (mapping->nrpages) {
+               ret = invalidate_inode_pages2(mapping);
+               WARN_ON_ONCE(ret);
+       }
+
+       if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+               iolock = XFS_IOLOCK_SHARED;
+       }
+
+       trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+       data = *from;
+       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+                       xfs_end_io_direct_write, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(from, ret);
+       }
+out:
+       xfs_rw_iunlock(ip, iolock);
        return ret;
 }
 
@@ -839,9 +789,8 @@ xfs_file_buffered_aio_write(
        current->backing_dev_info = inode_to_bdi(inode);
 
 write_retry:
-       trace_xfs_file_buffered_write(ip, iov_iter_count(from),
-                                     iocb->ki_pos, 0);
-       ret = generic_perform_write(file, from, iocb->ki_pos);
+       trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+       ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
        if (likely(ret >= 0))
                iocb->ki_pos += ret;
 
@@ -895,7 +844,9 @@ xfs_file_write_iter(
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
 
-       if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+       if (IS_DAX(inode))
+               ret = xfs_file_dax_write(iocb, from);
+       else if (iocb->ki_flags & IOCB_DIRECT)
                ret = xfs_file_dio_aio_write(iocb, from);
        else
                ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1553,7 +1504,7 @@ xfs_filemap_page_mkwrite(
        if (IS_DAX(inode)) {
                ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
        } else {
-               ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+               ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
                ret = block_page_mkwrite_return(ret);
        }
 
index b4d7582..7191c38 100644 (file)
@@ -667,8 +667,11 @@ xfs_reserve_blocks(
        __uint64_t              *inval,
        xfs_fsop_resblks_t      *outval)
 {
-       __int64_t               lcounter, delta, fdblks_delta;
+       __int64_t               lcounter, delta;
+       __int64_t               fdblks_delta = 0;
        __uint64_t              request;
+       __int64_t               free;
+       int                     error = 0;
 
        /* If inval is null, report current values and return */
        if (inval == (__uint64_t *)NULL) {
@@ -682,24 +685,23 @@ xfs_reserve_blocks(
        request = *inval;
 
        /*
-        * With per-cpu counters, this becomes an interesting
-        * problem. we needto work out if we are freeing or allocation
-        * blocks first, then we can do the modification as necessary.
+        * With per-cpu counters, this becomes an interesting problem. We need
+        * to work out if we are freeing or allocating blocks first, then we can
+        * do the modification as necessary.
         *
-        * We do this under the m_sb_lock so that if we are near
-        * ENOSPC, we will hold out any changes while we work out
-        * what to do. This means that the amount of free space can
-        * change while we do this, so we need to retry if we end up
-        * trying to reserve more space than is available.
+        * We do this under the m_sb_lock so that if we are near ENOSPC, we will
+        * hold out any changes while we work out what to do. This means that
+        * the amount of free space can change while we do this, so we need to
+        * retry if we end up trying to reserve more space than is available.
         */
-retry:
        spin_lock(&mp->m_sb_lock);
 
        /*
         * If our previous reservation was larger than the current value,
-        * then move any unused blocks back to the free pool.
+        * then move any unused blocks back to the free pool. Modify the resblks
+        * counters directly since we shouldn't have any problems unreserving
+        * space.
         */
-       fdblks_delta = 0;
        if (mp->m_resblks > request) {
                lcounter = mp->m_resblks_avail - request;
                if (lcounter  > 0) {            /* release unused blocks */
@@ -707,54 +709,67 @@ retry:
                        mp->m_resblks_avail -= lcounter;
                }
                mp->m_resblks = request;
-       } else {
-               __int64_t       free;
+               if (fdblks_delta) {
+                       spin_unlock(&mp->m_sb_lock);
+                       error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
+                       spin_lock(&mp->m_sb_lock);
+               }
+
+               goto out;
+       }
 
+       /*
+        * If the request is larger than the current reservation, reserve the
+        * blocks before we update the reserve counters. Sample m_fdblocks and
+        * perform a partial reservation if the request exceeds free space.
+        */
+       error = -ENOSPC;
+       do {
                free = percpu_counter_sum(&mp->m_fdblocks) -
                                                        XFS_ALLOC_SET_ASIDE(mp);
                if (!free)
-                       goto out; /* ENOSPC and fdblks_delta = 0 */
+                       break;
 
                delta = request - mp->m_resblks;
                lcounter = free - delta;
-               if (lcounter < 0) {
+               if (lcounter < 0)
                        /* We can't satisfy the request, just get what we can */
-                       mp->m_resblks += free;
-                       mp->m_resblks_avail += free;
-                       fdblks_delta = -free;
-               } else {
-                       fdblks_delta = -delta;
-                       mp->m_resblks = request;
-                       mp->m_resblks_avail += delta;
-               }
-       }
-out:
-       if (outval) {
-               outval->resblks = mp->m_resblks;
-               outval->resblks_avail = mp->m_resblks_avail;
-       }
-       spin_unlock(&mp->m_sb_lock);
+                       fdblks_delta = free;
+               else
+                       fdblks_delta = delta;
 
-       if (fdblks_delta) {
                /*
-                * If we are putting blocks back here, m_resblks_avail is
-                * already at its max so this will put it in the free pool.
-                *
-                * If we need space, we'll either succeed in getting it
-                * from the free block count or we'll get an enospc. If
-                * we get a ENOSPC, it means things changed while we were
-                * calculating fdblks_delta and so we should try again to
-                * see if there is anything left to reserve.
+                * We'll either succeed in getting space from the free block
+                * count or we'll get an ENOSPC. If we get a ENOSPC, it means
+                * things changed while we were calculating fdblks_delta and so
+                * we should try again to see if there is anything left to
+                * reserve.
                 *
                 * Don't set the reserved flag here - we don't want to reserve
                 * the extra reserve blocks from the reserve.....
                 */
-               int error;
-               error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
-               if (error == -ENOSPC)
-                       goto retry;
+               spin_unlock(&mp->m_sb_lock);
+               error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+               spin_lock(&mp->m_sb_lock);
+       } while (error == -ENOSPC);
+
+       /*
+        * Update the reserve counters if blocks have been successfully
+        * allocated.
+        */
+       if (!error && fdblks_delta) {
+               mp->m_resblks += fdblks_delta;
+               mp->m_resblks_avail += fdblks_delta;
        }
-       return 0;
+
+out:
+       if (outval) {
+               outval->resblks = mp->m_resblks;
+               outval->resblks_avail = mp->m_resblks_avail;
+       }
+
+       spin_unlock(&mp->m_sb_lock);
+       return error;
 }
 
 int
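The reworked xfs_reserve_blocks() above samples the free-space counter, drops m_sb_lock, tries to take the blocks with xfs_mod_fdblocks(), and loops if that races to ENOSPC, instead of jumping back to a retry label. The sketch below models that sample-modify-retry shape single-threaded with plain longs; it is only an illustration of the control flow, not of the locking, and the names are hypothetical.

#include <stdbool.h>
#include <stdio.h>

static long fdblocks = 1000;    /* free block count (a percpu counter in XFS) */
static long resblks;            /* reserve pool size */
static long resblks_avail;      /* reserved blocks still available */

/* Hypothetical stand-in for xfs_mod_fdblocks(): fail if it would go negative. */
static bool mod_fdblocks(long delta)
{
        if (fdblocks + delta < 0)
                return false;   /* -ENOSPC in the kernel */
        fdblocks += delta;
        return true;
}

/*
 * Grow the reserve pool towards 'request', taking only what is free. The
 * free-space sample and the modification are separate steps, so retry the
 * whole thing when the modification fails, as the patched function does.
 */
static void reserve_blocks(long request)
{
        while (resblks < request) {
                long free = fdblocks;
                long delta = request - resblks;

                if (free <= 0)
                        break;                  /* nothing left to take */
                if (delta > free)
                        delta = free;           /* partial reservation */
                if (!mod_fdblocks(-delta))
                        continue;               /* raced; resample and retry */

                resblks += delta;
                resblks_avail += delta;
                break;
        }
        printf("resblks=%ld avail=%ld free=%ld\n", resblks, resblks_avail, fdblocks);
}

int main(void)
{
        reserve_blocks(200);    /* fully satisfied */
        reserve_blocks(2000);   /* partially satisfied from what is left */
        return 0;
}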
index 99ee6ee..fb39a66 100644 (file)
@@ -765,7 +765,7 @@ restart:
  * Background scanning to trim post-EOF preallocated space. This is queued
  * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
  */
-STATIC void
+void
 xfs_queue_eofblocks(
        struct xfs_mount *mp)
 {
index 62f1f91..05bac99 100644 (file)
@@ -68,6 +68,7 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
 int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
 int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
 void xfs_eofblocks_worker(struct work_struct *);
+void xfs_queue_eofblocks(struct xfs_mount *);
 
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
        int (*execute)(struct xfs_inode *ip, int flags, void *args),
index ee6799e..8825bcf 100644 (file)
@@ -431,7 +431,7 @@ xfs_lock_inumorder(int lock_mode, int subclass)
  * lock more than one at a time, lockdep will report false positives saying we
  * have violated locking orders.
  */
-void
+static void
 xfs_lock_inodes(
        xfs_inode_t     **ips,
        int             inodes,
@@ -667,14 +667,6 @@ xfs_ip2xflags(
        return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
 }
 
-uint
-xfs_dic2xflags(
-       struct xfs_dinode       *dip)
-{
-       return _xfs_dic2xflags(be16_to_cpu(dip->di_flags),
-                               be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip));
-}
-
 /*
  * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
  * is allowed, otherwise it has to be an exact match. If a CI match is found,
@@ -748,7 +740,7 @@ out_unlock:
  * are not linked into the directory structure - they are attached
  * directly to the superblock - and so have no parent.
  */
-int
+static int
 xfs_ialloc(
        xfs_trans_t     *tp,
        xfs_inode_t     *pip,
@@ -1085,7 +1077,7 @@ xfs_dir_ialloc(
  * link count to go to zero, move the inode to AGI unlinked list so that it can
  * be freed when the last active reference goes away via xfs_inactive().
  */
-int                            /* error */
+static int                     /* error */
 xfs_droplink(
        xfs_trans_t *tp,
        xfs_inode_t *ip)
@@ -1104,7 +1096,7 @@ xfs_droplink(
 /*
  * Increment the link count on an inode & log the change.
  */
-int
+static int
 xfs_bumplink(
        xfs_trans_t *tp,
        xfs_inode_t *ip)
index e52d7c7..8eb78ec 100644 (file)
@@ -395,12 +395,8 @@ void               xfs_ilock_demote(xfs_inode_t *, uint);
 int            xfs_isilocked(xfs_inode_t *, uint);
 uint           xfs_ilock_data_map_shared(struct xfs_inode *);
 uint           xfs_ilock_attr_map_shared(struct xfs_inode *);
-int            xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
-                          xfs_nlink_t, xfs_dev_t, prid_t, int,
-                          struct xfs_buf **, xfs_inode_t **);
 
 uint           xfs_ip2xflags(struct xfs_inode *);
-uint           xfs_dic2xflags(struct xfs_dinode *);
 int            xfs_ifree(struct xfs_trans *, xfs_inode_t *,
                           struct xfs_bmap_free *);
 int            xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
@@ -411,7 +407,6 @@ void                xfs_iunpin_wait(xfs_inode_t *);
 #define xfs_ipincount(ip)      ((unsigned int) atomic_read(&ip->i_pincount))
 
 int            xfs_iflush(struct xfs_inode *, struct xfs_buf **);
-void           xfs_lock_inodes(xfs_inode_t **, int, uint);
 void           xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
 xfs_extlen_t   xfs_get_extsz_hint(struct xfs_inode *ip);
@@ -419,8 +414,6 @@ xfs_extlen_t        xfs_get_extsz_hint(struct xfs_inode *ip);
 int            xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
                               xfs_nlink_t, xfs_dev_t, prid_t, int,
                               struct xfs_inode **, int *);
-int            xfs_droplink(struct xfs_trans *, struct xfs_inode *);
-int            xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
 
 /* from xfs_file.c */
 enum xfs_prealloc_flags {
@@ -434,7 +427,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip,
                                  enum xfs_prealloc_flags flags);
 int    xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
                     xfs_fsize_t isize, bool *did_zeroing);
-int    xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
+int    xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count,
+               bool *did_zero);
 loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
                             loff_t eof, int whence);
 
@@ -479,14 +473,4 @@ do { \
 
 extern struct kmem_zone        *xfs_inode_zone;
 
-/*
- * Flags for read/write calls
- */
-#define XFS_IO_ISDIRECT        0x00001         /* bypass page cache */
-#define XFS_IO_INVIS   0x00002         /* don't update inode timestamps */
-
-#define XFS_IO_FLAGS \
-       { XFS_IO_ISDIRECT,      "DIRECT" }, \
-       { XFS_IO_INVIS,         "INVIS"}
-
 #endif /* __XFS_INODE_H__ */
index a1b0761..892c2ac 100644 (file)
@@ -651,6 +651,7 @@ void
 xfs_inode_item_destroy(
        xfs_inode_t     *ip)
 {
+       kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
        kmem_zone_free(xfs_ili_zone, ip->i_itemp);
 }
 
index 63a6ff2..9a7c878 100644 (file)
@@ -595,13 +595,12 @@ xfs_attrmulti_by_handle(
 
 int
 xfs_ioc_space(
-       struct xfs_inode        *ip,
-       struct inode            *inode,
        struct file             *filp,
-       int                     ioflags,
        unsigned int            cmd,
        xfs_flock64_t           *bf)
 {
+       struct inode            *inode = file_inode(filp);
+       struct xfs_inode        *ip = XFS_I(inode);
        struct iattr            iattr;
        enum xfs_prealloc_flags flags = 0;
        uint                    iolock = XFS_IOLOCK_EXCL;
@@ -626,7 +625,7 @@ xfs_ioc_space(
 
        if (filp->f_flags & O_DSYNC)
                flags |= XFS_PREALLOC_SYNC;
-       if (ioflags & XFS_IO_INVIS)
+       if (filp->f_mode & FMODE_NOCMTIME)
                flags |= XFS_PREALLOC_INVISIBLE;
 
        error = mnt_want_write_file(filp);
@@ -1464,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
 
 STATIC int
 xfs_ioc_getbmap(
-       struct xfs_inode        *ip,
-       int                     ioflags,
+       struct file             *file,
        unsigned int            cmd,
        void                    __user *arg)
 {
@@ -1479,10 +1477,10 @@ xfs_ioc_getbmap(
                return -EINVAL;
 
        bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
-       if (ioflags & XFS_IO_INVIS)
+       if (file->f_mode & FMODE_NOCMTIME)
                bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
 
-       error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
+       error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format,
                            (__force struct getbmap *)arg+1);
        if (error)
                return error;
@@ -1575,6 +1573,11 @@ xfs_ioc_swapext(
                goto out_put_tmp_file;
        }
 
+       /*
+        * We need to ensure that the fds passed in point to XFS inodes
+        * before we cast and access them as XFS structures, as we have no
+        * control over what the user passes us here.
+        */
        if (f.file->f_op != &xfs_file_operations ||
            tmp.file->f_op != &xfs_file_operations) {
                error = -EINVAL;
@@ -1625,12 +1628,8 @@ xfs_file_ioctl(
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        void                    __user *arg = (void __user *)p;
-       int                     ioflags = 0;
        int                     error;
 
-       if (filp->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
-
        trace_xfs_file_ioctl(ip);
 
        switch (cmd) {
@@ -1649,7 +1648,7 @@ xfs_file_ioctl(
 
                if (copy_from_user(&bf, arg, sizeof(bf)))
                        return -EFAULT;
-               return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+               return xfs_ioc_space(filp, cmd, &bf);
        }
        case XFS_IOC_DIOINFO: {
                struct dioattr  da;
@@ -1708,7 +1707,7 @@ xfs_file_ioctl(
 
        case XFS_IOC_GETBMAP:
        case XFS_IOC_GETBMAPA:
-               return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
+               return xfs_ioc_getbmap(filp, cmd, arg);
 
        case XFS_IOC_GETBMAPX:
                return xfs_ioc_getbmapx(ip, arg);
index 77c02c7..8b52881 100644 (file)
 
 extern int
 xfs_ioc_space(
-       struct xfs_inode        *ip,
-       struct inode            *inode,
        struct file             *filp,
-       int                     ioflags,
        unsigned int            cmd,
        xfs_flock64_t           *bf);
 
index 1a05d8a..321f577 100644 (file)
@@ -532,12 +532,8 @@ xfs_file_compat_ioctl(
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        void                    __user *arg = (void __user *)p;
-       int                     ioflags = 0;
        int                     error;
 
-       if (filp->f_mode & FMODE_NOCMTIME)
-               ioflags |= XFS_IO_INVIS;
-
        trace_xfs_file_compat_ioctl(ip);
 
        switch (cmd) {
@@ -589,7 +585,7 @@ xfs_file_compat_ioctl(
                if (xfs_compat_flock64_copyin(&bf, arg))
                        return -EFAULT;
                cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
-               return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+               return xfs_ioc_space(filp, cmd, &bf);
        }
        case XFS_IOC_FSGEOMETRY_V1_32:
                return xfs_compat_ioc_fsgeometry_v1(mp, arg);
index 5839135..620fc91 100644 (file)
@@ -15,6 +15,7 @@
  * along with this program; if not, write the Free Software Foundation,
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
+#include <linux/iomap.h>
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
@@ -940,3 +941,173 @@ error_on_bmapi_transaction:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 }
+
+void
+xfs_bmbt_to_iomap(
+       struct xfs_inode        *ip,
+       struct iomap            *iomap,
+       struct xfs_bmbt_irec    *imap)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+
+       if (imap->br_startblock == HOLESTARTBLOCK) {
+               iomap->blkno = IOMAP_NULL_BLOCK;
+               iomap->type = IOMAP_HOLE;
+       } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+               iomap->blkno = IOMAP_NULL_BLOCK;
+               iomap->type = IOMAP_DELALLOC;
+       } else {
+               iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+               if (imap->br_state == XFS_EXT_UNWRITTEN)
+                       iomap->type = IOMAP_UNWRITTEN;
+               else
+                       iomap->type = IOMAP_MAPPED;
+       }
+       iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+       iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+       iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+}
+
+static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+{
+       return !nimaps ||
+               imap->br_startblock == HOLESTARTBLOCK ||
+               imap->br_startblock == DELAYSTARTBLOCK;
+}
+
+static int
+xfs_file_iomap_begin(
+       struct inode            *inode,
+       loff_t                  offset,
+       loff_t                  length,
+       unsigned                flags,
+       struct iomap            *iomap)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_bmbt_irec    imap;
+       xfs_fileoff_t           offset_fsb, end_fsb;
+       int                     nimaps = 1, error = 0;
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+       ASSERT(offset <= mp->m_super->s_maxbytes);
+       if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
+               length = mp->m_super->s_maxbytes - offset;
+       offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+       error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+                              &nimaps, XFS_BMAPI_ENTIRE);
+       if (error) {
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               return error;
+       }
+
+       if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+               /*
+                * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+                * pages to keep the chunks of work done where somewhat symmetric
+                * pages to keep the chunks of work done here somewhat symmetric
+                * number pulled out of thin air as a best guess for initial
+                * testing.
+                *
+                * Note that the value needs to be less than 32 bits wide until
+                * the lower level functions are updated.
+                */
+               length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+               if (xfs_get_extsz_hint(ip)) {
+                       /*
+                        * xfs_iomap_write_direct() expects the shared lock. It
+                        * is unlocked on return.
+                        */
+                       xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+                       error = xfs_iomap_write_direct(ip, offset, length, &imap,
+                                       nimaps);
+               } else {
+                       error = xfs_iomap_write_delay(ip, offset, length, &imap);
+                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               }
+
+               if (error)
+                       return error;
+
+               trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+               xfs_bmbt_to_iomap(ip, iomap, &imap);
+       } else if (nimaps) {
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+               xfs_bmbt_to_iomap(ip, iomap, &imap);
+       } else {
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
+               iomap->blkno = IOMAP_NULL_BLOCK;
+               iomap->type = IOMAP_HOLE;
+               iomap->offset = offset;
+               iomap->length = length;
+       }
+
+       return 0;
+}
+
+static int
+xfs_file_iomap_end_delalloc(
+       struct xfs_inode        *ip,
+       loff_t                  offset,
+       loff_t                  length,
+       ssize_t                 written)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           start_fsb;
+       xfs_fileoff_t           end_fsb;
+       int                     error = 0;
+
+       start_fsb = XFS_B_TO_FSB(mp, offset + written);
+       end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+       /*
+        * Trim back delalloc blocks if we didn't manage to write the whole
+        * range reserved.
+        *
+        * We don't need to care about racing delalloc as we hold i_mutex
+        * across the reserve/allocate/unreserve calls. If there are delalloc
+        * blocks in the range, they are ours.
+        */
+       if (start_fsb < end_fsb) {
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+                                              end_fsb - start_fsb);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+               if (error && !XFS_FORCED_SHUTDOWN(mp)) {
+                       xfs_alert(mp, "%s: unable to clean up ino %lld",
+                               __func__, ip->i_ino);
+                       return error;
+               }
+       }
+
+       return 0;
+}
+
+static int
+xfs_file_iomap_end(
+       struct inode            *inode,
+       loff_t                  offset,
+       loff_t                  length,
+       ssize_t                 written,
+       unsigned                flags,
+       struct iomap            *iomap)
+{
+       if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
+               return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
+                               length, written);
+       return 0;
+}
+
+struct iomap_ops xfs_iomap_ops = {
+       .iomap_begin            = xfs_file_iomap_begin,
+       .iomap_end              = xfs_file_iomap_end,
+};
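xfs_iomap_ops wires XFS into the new generic iomap infrastructure: a caller such as iomap_zero_range() or iomap_fiemap() repeatedly asks ->iomap_begin() for one mapping, does its work against that mapping, then calls ->iomap_end() so the filesystem can clean up (for XFS, punching out unused delalloc blocks as above). The user-space sketch below models that begin/actor/end loop with a toy filesystem that maps fixed 4096-byte blocks; it is a simplified picture of the contract, not the fs/iomap.c code, and every toy_* name is invented.

#include <stdint.h>
#include <stdio.h>

struct iomap {
        uint64_t        offset;         /* file offset the mapping starts at */
        uint64_t        length;         /* bytes covered by this mapping */
};

struct iomap_ops {
        int (*iomap_begin)(uint64_t pos, uint64_t len, struct iomap *iomap);
        int (*iomap_end)(uint64_t pos, uint64_t len, uint64_t written,
                         struct iomap *iomap);
};

/* Toy filesystem: pretend every mapping is one 4096-byte block. */
static int toy_iomap_begin(uint64_t pos, uint64_t len, struct iomap *iomap)
{
        (void)len;
        iomap->offset = pos & ~4095ULL;
        iomap->length = 4096;
        return 0;
}

static int toy_iomap_end(uint64_t pos, uint64_t len, uint64_t written,
                         struct iomap *iomap)
{
        (void)iomap;
        printf("end:   pos=%llu len=%llu written=%llu\n",
               (unsigned long long)pos, (unsigned long long)len,
               (unsigned long long)written);
        return 0;
}

static const struct iomap_ops toy_ops = {
        .iomap_begin    = toy_iomap_begin,
        .iomap_end      = toy_iomap_end,
};

/* The generic loop: one begin/actor/end round trip per mapping. */
static int iomap_apply(uint64_t pos, uint64_t length, const struct iomap_ops *ops)
{
        while (length > 0) {
                struct iomap iomap;
                int ret;

                ret = ops->iomap_begin(pos, length, &iomap);
                if (ret)
                        return ret;

                /* clamp to what the mapping actually covers from pos onwards */
                uint64_t avail = iomap.offset + iomap.length - pos;
                uint64_t len = avail < length ? avail : length;

                /* "actor": a real caller zeroes, writes or reports extents here */
                printf("actor: pos=%llu len=%llu\n",
                       (unsigned long long)pos, (unsigned long long)len);

                ret = ops->iomap_end(pos, len, len, &iomap);
                if (ret)
                        return ret;

                pos += len;
                length -= len;
        }
        return 0;
}

int main(void)
{
        return iomap_apply(1000, 10000, &toy_ops);
}

The point of the split is that the filesystem only has to translate ranges into mappings, while the page-cache, zeroing and fiemap logic lives once in generic code and can be shared by other filesystems.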
index 8688e66..e066d04 100644 (file)
@@ -18,6 +18,8 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__
 
+#include <linux/iomap.h>
+
 struct xfs_inode;
 struct xfs_bmbt_irec;
 
@@ -29,4 +31,9 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
                        struct xfs_bmbt_irec *);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
 
+void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+               struct xfs_bmbt_irec *);
+
+extern struct iomap_ops xfs_iomap_ops;
+
 #endif /* __XFS_IOMAP_H__*/
index c5d4eba..ab820f8 100644 (file)
 #include "xfs_dir2.h"
 #include "xfs_trans_space.h"
 #include "xfs_pnfs.h"
+#include "xfs_iomap.h"
 
 #include <linux/capability.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/security.h>
-#include <linux/fiemap.h>
+#include <linux/iomap.h>
 #include <linux/slab.h>
 
 /*
@@ -800,21 +801,31 @@ xfs_setattr_size(
        if (error)
                return error;
 
+       /*
+        * Wait for all direct I/O to complete.
+        */
+       inode_dio_wait(inode);
+
        /*
         * File data changes must be complete before we start the transaction to
         * modify the inode.  This needs to be done before joining the inode to
         * the transaction because the inode cannot be unlocked once it is a
         * part of the transaction.
         *
-        * Start with zeroing any data block beyond EOF that we may expose on
-        * file extension.
+        * Start with zeroing any data beyond EOF that we may expose on file
+        * extension, or zeroing out the rest of the block on a downward
+        * truncate.
         */
        if (newsize > oldsize) {
                error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
-               if (error)
-                       return error;
+       } else {
+               error = iomap_truncate_page(inode, newsize, &did_zeroing,
+                               &xfs_iomap_ops);
        }
 
+       if (error)
+               return error;
+
        /*
         * We are going to log the inode size change in this transaction so
         * any previous writes that are beyond the on disk EOF and the new
@@ -823,17 +834,14 @@ xfs_setattr_size(
         * problem. Note that this includes any block zeroing we did above;
         * otherwise those blocks may not be zeroed after a crash.
         */
-       if (newsize > ip->i_d.di_size &&
-           (oldsize != ip->i_d.di_size || did_zeroing)) {
+       if (did_zeroing ||
+           (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
                error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
                                                      ip->i_d.di_size, newsize);
                if (error)
                        return error;
        }
 
-       /* Now wait for all direct I/O to complete. */
-       inode_dio_wait(inode);
-
        /*
         * We've already locked out new page faults, so now we can safely remove
         * pages from the page cache knowing they won't get refaulted until we
@@ -851,13 +859,6 @@ xfs_setattr_size(
         * to hope that the caller sees ENOMEM and retries the truncate
         * operation.
         */
-       if (IS_DAX(inode))
-               error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
-       else
-               error = block_truncate_page(inode->i_mapping, newsize,
-                                           xfs_get_blocks);
-       if (error)
-               return error;
        truncate_setsize(inode, newsize);
 
        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
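xfs_setattr_size() now zeroes in two directions: growing the file zeroes from the old EOF up to the new size via xfs_zero_eof(), while shrinking it uses iomap_truncate_page() to zero the tail of the block that will contain the new EOF, so a later extension cannot expose stale data. A small stand-alone sketch of which byte range each case covers, assuming a fixed 4096-byte block size; it only illustrates the arithmetic, not the kernel helpers.

#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE 4096ULL

/*
 * On a size change, work out which byte range must be zeroed so no stale
 * data is exposed: growing zeroes from the old EOF to the new one, while
 * shrinking zeroes the remainder of the block that now contains EOF.
 */
static void zero_range_for_truncate(uint64_t oldsize, uint64_t newsize,
                                    uint64_t *start, uint64_t *len)
{
        if (newsize > oldsize) {
                *start = oldsize;
                *len = newsize - oldsize;
        } else {
                *start = newsize;
                *len = (BLOCK_SIZE - (newsize % BLOCK_SIZE)) % BLOCK_SIZE;
        }
}

int main(void)
{
        uint64_t start, len;

        zero_range_for_truncate(5000, 20000, &start, &len);
        printf("grow:   zero %llu bytes at %llu\n",
               (unsigned long long)len, (unsigned long long)start);

        zero_range_for_truncate(20000, 5000, &start, &len);
        printf("shrink: zero %llu bytes at %llu\n",
               (unsigned long long)len, (unsigned long long)start);
        return 0;
}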
@@ -998,51 +999,6 @@ xfs_vn_update_time(
        return xfs_trans_commit(tp);
 }
 
-#define XFS_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
-
-/*
- * Call fiemap helper to fill in user data.
- * Returns positive errors to xfs_getbmap.
- */
-STATIC int
-xfs_fiemap_format(
-       void                    **arg,
-       struct getbmapx         *bmv,
-       int                     *full)
-{
-       int                     error;
-       struct fiemap_extent_info *fieinfo = *arg;
-       u32                     fiemap_flags = 0;
-       u64                     logical, physical, length;
-
-       /* Do nothing for a hole */
-       if (bmv->bmv_block == -1LL)
-               return 0;
-
-       logical = BBTOB(bmv->bmv_offset);
-       physical = BBTOB(bmv->bmv_block);
-       length = BBTOB(bmv->bmv_length);
-
-       if (bmv->bmv_oflags & BMV_OF_PREALLOC)
-               fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
-       else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
-               fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
-                                FIEMAP_EXTENT_UNKNOWN);
-               physical = 0;   /* no block yet */
-       }
-       if (bmv->bmv_oflags & BMV_OF_LAST)
-               fiemap_flags |= FIEMAP_EXTENT_LAST;
-
-       error = fiemap_fill_next_extent(fieinfo, logical, physical,
-                                       length, fiemap_flags);
-       if (error > 0) {
-               error = 0;
-               *full = 1;      /* user array now full */
-       }
-
-       return error;
-}
-
 STATIC int
 xfs_vn_fiemap(
        struct inode            *inode,
@@ -1050,38 +1006,13 @@ xfs_vn_fiemap(
        u64                     start,
        u64                     length)
 {
-       xfs_inode_t             *ip = XFS_I(inode);
-       struct getbmapx         bm;
        int                     error;
 
-       error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
-       if (error)
-               return error;
-
-       /* Set up bmap header for xfs internal routine */
-       bm.bmv_offset = BTOBBT(start);
-       /* Special case for whole file */
-       if (length == FIEMAP_MAX_OFFSET)
-               bm.bmv_length = -1LL;
-       else
-               bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
-
-       /* We add one because in getbmap world count includes the header */
-       bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
-                                       fieinfo->fi_extents_max + 1;
-       bm.bmv_count = min_t(__s32, bm.bmv_count,
-                            (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
-       bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
-       if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
-               bm.bmv_iflags |= BMV_IF_ATTRFORK;
-       if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
-               bm.bmv_iflags |= BMV_IF_DELALLOC;
-
-       error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
-       if (error)
-               return error;
+       xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
+       error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
+       xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
 
-       return 0;
+       return error;
 }
 
 STATIC int
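
For comparison, a filesystem adopting the new helper only needs its own iomap_ops and whatever lock protects its extent map; the ->fiemap method collapses to a thin wrapper around iomap_fiemap(). A sketch using hypothetical example_ names (the locking helpers and ops instance are assumptions, not any real filesystem's API):

	static int
	example_fiemap(
		struct inode		*inode,
		struct fiemap_extent_info *fieinfo,
		u64			start,
		u64			length)
	{
		int			error;

		example_lock_shared(inode);	/* assumed extent-map locking helper */
		error = iomap_fiemap(inode, fieinfo, start, length,
				&example_iomap_ops);
		example_unlock_shared(inode);

		return error;
	}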
index a8192dc..b8d64d5 100644 (file)
@@ -328,13 +328,6 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
        return x;
 }
 
-/* ARM old ABI has some weird alignment/padding */
-#if defined(__arm__) && !defined(__ARM_EABI__)
-#define __arch_pack __attribute__((packed))
-#else
-#define __arch_pack
-#endif
-
 #define ASSERT_ALWAYS(expr)    \
        (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
 
index bde02f1..3b74fa0 100644 (file)
@@ -788,7 +788,7 @@ xfs_log_mount_cancel(
  * As far as I know, there weren't any dependencies on the old behaviour.
  */
 
-int
+static int
 xfs_log_unmount_write(xfs_mount_t *mp)
 {
        struct xlog      *log = mp->m_log;
@@ -1036,7 +1036,7 @@ xfs_log_space_wake(
  * there's no point in running a dummy transaction at this point because we
  * can't start trying to idle the log until both the CIL and AIL are empty.
  */
-int
+static int
 xfs_log_need_covered(xfs_mount_t *mp)
 {
        struct xlog     *log = mp->m_log;
@@ -1177,7 +1177,7 @@ xlog_space_left(
  * The log manager needs its own routine, in order to control what
  * happens with the buffer after the write completes.
  */
-void
+static void
 xlog_iodone(xfs_buf_t *bp)
 {
        struct xlog_in_core     *iclog = bp->b_fspriv;
@@ -1302,7 +1302,7 @@ xfs_log_work_queue(
  * disk. If there is nothing dirty, then we might need to cover the log to
  * indicate that the filesystem is idle.
  */
-void
+static void
 xfs_log_worker(
        struct work_struct      *work)
 {
@@ -1415,7 +1415,7 @@ xlog_alloc_log(
         */
        error = -ENOMEM;
        bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
-                          BTOBB(log->l_iclog_size), 0);
+                          BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
        if (!bp)
                goto out_free_log;
 
@@ -1454,7 +1454,8 @@ xlog_alloc_log(
                prev_iclog = iclog;
 
                bp = xfs_buf_get_uncached(mp->m_logdev_targp,
-                                               BTOBB(log->l_iclog_size), 0);
+                                         BTOBB(log->l_iclog_size),
+                                         XBF_NO_IOACCT);
                if (!bp)
                        goto out_free_iclog;
 
index 80ba0c0..b5e7107 100644 (file)
@@ -163,12 +163,8 @@ int          xfs_log_reserve(struct xfs_mount *mp,
                          __uint8_t        clientid,
                          bool             permanent);
 int      xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
-int      xfs_log_unmount_write(struct xfs_mount *mp);
 void      xfs_log_unmount(struct xfs_mount *mp);
 int      xfs_log_force_umount(struct xfs_mount *mp, int logerror);
-int      xfs_log_need_covered(struct xfs_mount *mp);
-
-void     xlog_iodone(struct xfs_buf *);
 
 struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void     xfs_log_ticket_put(struct xlog_ticket *ticket);
@@ -178,7 +174,6 @@ void        xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 bool   xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
 void   xfs_log_work_queue(struct xfs_mount *mp);
-void   xfs_log_worker(struct work_struct *work);
 void   xfs_log_quiesce(struct xfs_mount *mp);
 bool   xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
 
index 5e54e79..a4ab192 100644 (file)
@@ -78,6 +78,157 @@ xlog_cil_init_post_recovery(
        log->l_cilp->xc_ctx->sequence = 1;
 }
 
+static inline int
+xlog_cil_iovec_space(
+       uint    niovecs)
+{
+       return round_up((sizeof(struct xfs_log_vec) +
+                                       niovecs * sizeof(struct xfs_log_iovec)),
+                       sizeof(uint64_t));
+}
+
+/*
+ * Allocate or pin log vector buffers for CIL insertion.
+ *
+ * The CIL currently uses disposable buffers for copying a snapshot of the
+ * modified items into the log during a push. The biggest problem with this is
+ * the requirement to allocate the disposable buffer during the commit if:
+ *     a) it does not exist; or
+ *     b) it is too small
+ *
+ * If we do this allocation within xlog_cil_insert_format_items(), it is done
+ * under the xc_ctx_lock, which means that a CIL push cannot occur during
+ * the memory allocation. This means that we have a potential deadlock situation
+ * under low memory conditions when we have lots of dirty metadata pinned in
+ * the CIL and we need a CIL commit to occur to free memory.
+ *
+ * To avoid this, we need to move the memory allocation outside the
+ * xc_ctx_lock, but because the log vector buffers are disposable, that opens
+ * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
+ * vector buffers between the check and the formatting of the item into the
+ * log vector buffer within the xc_ctx_lock.
+ *
+ * Because the log vector buffer needs to be unchanged during the CIL push
+ * process, we cannot share the buffer between the transaction commit (which
+ * modifies the buffer) and the CIL push context that is writing the changes
+ * into the log. This means skipping preallocation of buffer space is
+ * unreliable, but we most definitely do not want to be allocating and freeing
+ * buffers unnecessarily during commits when overwrites can be done safely.
+ *
+ * The simplest solution to this problem is to allocate a shadow buffer when a
+ * log item is committed for the second time, and then to only use this buffer
+ * if necessary. The buffer can remain attached to the log item until such time
+ * as it is needed, and this is the buffer that is reallocated to match the size
+ * of the incoming modification. Then during the formatting of the item we can swap
+ * the active buffer with the new one if we can't reuse the existing buffer. We
+ * don't free the old buffer as it may be reused on the next modification if
+ * its size is right; otherwise we'll free and reallocate it at that point.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and attaches the vector to the log item in preparation
+ * for the formatting step which occurs under the xc_ctx_lock.
+ *
+ * While this means the memory footprint goes up, it avoids the repeated
+ * alloc/free pattern that repeated modifications of an item would otherwise
+ * cause, and hence minimises the CPU overhead of such behaviour.
+ */
+static void
+xlog_cil_alloc_shadow_bufs(
+       struct xlog             *log,
+       struct xfs_trans        *tp)
+{
+       struct xfs_log_item_desc *lidp;
+
+       list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+               struct xfs_log_item *lip = lidp->lid_item;
+               struct xfs_log_vec *lv;
+               int     niovecs = 0;
+               int     nbytes = 0;
+               int     buf_size;
+               bool    ordered = false;
+
+               /* Skip items which aren't dirty in this transaction. */
+               if (!(lidp->lid_flags & XFS_LID_DIRTY))
+                       continue;
+
+               /* get number of vecs and size of data to be stored */
+               lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+
+               /*
+                * Ordered items need to be tracked but we do not wish to write
+                * them. We need a logvec to track the object, but we do not
+                * need an iovec or buffer to be allocated for copying data.
+                */
+               if (niovecs == XFS_LOG_VEC_ORDERED) {
+                       ordered = true;
+                       niovecs = 0;
+                       nbytes = 0;
+               }
+
+               /*
+                * We 64-bit align the length of each iovec so that the start
+                * of the next one is naturally aligned.  We'll need to
+                * account for that slack space here. Then round nbytes up
+                * to 64-bit alignment so that the initial buffer alignment is
+                * easy to calculate and verify.
+                */
+               nbytes += niovecs * sizeof(uint64_t);
+               nbytes = round_up(nbytes, sizeof(uint64_t));
+
+               /*
+                * The data buffer needs to start 64-bit aligned, so round up
+                * that space to ensure we can align it appropriately and not
+                * overrun the buffer.
+                */
+               buf_size = nbytes + xlog_cil_iovec_space(niovecs);
+
+               /*
+                * if we have no shadow buffer, or it is too small, we need to
+                * reallocate it.
+                */
+               if (!lip->li_lv_shadow ||
+                   buf_size > lip->li_lv_shadow->lv_size) {
+
+                       /*
+                        * We free and allocate here as a realloc would copy
+                        * unecessary data. We don't use kmem_zalloc() for the
+                        * unnecessary data. We don't use kmem_zalloc() for the
+                        * same reason - we don't need to zero the data area in
+                        * the buffer, only the log vector header and the iovec
+                        * storage.
+                        */
+                       kmem_free(lip->li_lv_shadow);
+
+                       lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS);
+                       memset(lv, 0, xlog_cil_iovec_space(niovecs));
+
+                       lv->lv_item = lip;
+                       lv->lv_size = buf_size;
+                       if (ordered)
+                               lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+                       else
+                               lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
+                       lip->li_lv_shadow = lv;
+               } else {
+                       /* same or smaller, optimise common overwrite case */
+                       lv = lip->li_lv_shadow;
+                       if (ordered)
+                               lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+                       else
+                               lv->lv_buf_len = 0;
+                       lv->lv_bytes = 0;
+                       lv->lv_next = NULL;
+               }
+
+               /* Ensure the lv is set up according to ->iop_size */
+               lv->lv_niovecs = niovecs;
+
+               /* The allocated data region lies beyond the iovec region */
+               lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
+       }
+
+}
+
 /*
  * Prepare the log item for insertion into the CIL. Calculate the difference in
  * log space and vectors it will consume, and if it is a new item pin it as
@@ -100,16 +251,19 @@ xfs_cil_prepare_item(
        /*
         * If there is no old LV, this is the first time we've seen the item in
         * this CIL context and so we need to pin it. If we are replacing the
-        * old_lv, then remove the space it accounts for and free it.
+        * old_lv, then remove the space it accounts for and make it the shadow
+        * buffer for later freeing. In both cases we are now switching to the
+        * shadow buffer, so update the pointer to it appropriately.
         */
-       if (!old_lv)
+       if (!old_lv) {
                lv->lv_item->li_ops->iop_pin(lv->lv_item);
-       else if (old_lv != lv) {
+               lv->lv_item->li_lv_shadow = NULL;
+       } else if (old_lv != lv) {
                ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
 
                *diff_len -= old_lv->lv_bytes;
                *diff_iovecs -= old_lv->lv_niovecs;
-               kmem_free(old_lv);
+               lv->lv_item->li_lv_shadow = old_lv;
        }
 
        /* attach new log vector to log item */
@@ -133,11 +287,13 @@ xfs_cil_prepare_item(
  * write it out asynchronously without needing to relock the object that was
  * modified at the time it gets written into the iclog.
  *
- * This function builds a vector for the changes in each log item in the
- * transaction. It then works out the length of the buffer needed for each log
- * item, allocates them and formats the vector for the item into the buffer.
- * The buffer is then attached to the log item are then inserted into the
- * Committed Item List for tracking until the next checkpoint is written out.
+ * This function takes the prepared log vectors attached to each log item, and
+ * formats the changes into the log vector buffer. The buffer it uses is
+ * dependent on the current state of the vector in the CIL - the shadow lv is
+ * guaranteed to be large enough for the current modification, but we will only
+ * use that if we can't reuse the existing lv. If we can't reuse the existing
+ * lv, then simply swap it out for the shadow lv. We don't free it - that is
+ * done lazily either by the next modification or the freeing of the log item.
  *
  * We don't set up region headers during this process; we simply copy the
  * regions into the flat buffer. We can do this because we still have to do a
@@ -170,59 +326,29 @@ xlog_cil_insert_format_items(
        list_for_each_entry(lidp, &tp->t_items, lid_trans) {
                struct xfs_log_item *lip = lidp->lid_item;
                struct xfs_log_vec *lv;
-               struct xfs_log_vec *old_lv;
-               int     niovecs = 0;
-               int     nbytes = 0;
-               int     buf_size;
+               struct xfs_log_vec *old_lv = NULL;
+               struct xfs_log_vec *shadow;
                bool    ordered = false;
 
                /* Skip items which aren't dirty in this transaction. */
                if (!(lidp->lid_flags & XFS_LID_DIRTY))
                        continue;
 
-               /* get number of vecs and size of data to be stored */
-               lip->li_ops->iop_size(lip, &niovecs, &nbytes);
-
-               /* Skip items that do not have any vectors for writing */
-               if (!niovecs)
-                       continue;
-
                /*
-                * Ordered items need to be tracked but we do not wish to write
-                * them. We need a logvec to track the object, but we do not
-                * need an iovec or buffer to be allocated for copying data.
+                * The formatting size information is already attached to
+                * the shadow lv on the log item.
                 */
-               if (niovecs == XFS_LOG_VEC_ORDERED) {
+               shadow = lip->li_lv_shadow;
+               if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
                        ordered = true;
-                       niovecs = 0;
-                       nbytes = 0;
-               }
 
-               /*
-                * We 64-bit align the length of each iovec so that the start
-                * of the next one is naturally aligned.  We'll need to
-                * account for that slack space here. Then round nbytes up
-                * to 64-bit alignment so that the initial buffer alignment is
-                * easy to calculate and verify.
-                */
-               nbytes += niovecs * sizeof(uint64_t);
-               nbytes = round_up(nbytes, sizeof(uint64_t));
-
-               /* grab the old item if it exists for reservation accounting */
-               old_lv = lip->li_lv;
-
-               /*
-                * The data buffer needs to start 64-bit aligned, so round up
-                * that space to ensure we can align it appropriately and not
-                * overrun the buffer.
-                */
-               buf_size = nbytes +
-                          round_up((sizeof(struct xfs_log_vec) +
-                                    niovecs * sizeof(struct xfs_log_iovec)),
-                                   sizeof(uint64_t));
+               /* Skip items that do not have any vectors for writing */
+               if (!shadow->lv_niovecs && !ordered)
+                       continue;
 
                /* compare to existing item size */
-               if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
+               old_lv = lip->li_lv;
+               if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
                        /* same or smaller, optimise common overwrite case */
                        lv = lip->li_lv;
                        lv->lv_next = NULL;
@@ -236,32 +362,29 @@ xlog_cil_insert_format_items(
                         */
                        *diff_iovecs -= lv->lv_niovecs;
                        *diff_len -= lv->lv_bytes;
+
+                       /* Ensure the lv is set up according to ->iop_size */
+                       lv->lv_niovecs = shadow->lv_niovecs;
+
+                       /* reset the lv buffer information for new formatting */
+                       lv->lv_buf_len = 0;
+                       lv->lv_bytes = 0;
+                       lv->lv_buf = (char *)lv +
+                                       xlog_cil_iovec_space(lv->lv_niovecs);
                } else {
-                       /* allocate new data chunk */
-                       lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+                       /* switch to shadow buffer! */
+                       lv = shadow;
                        lv->lv_item = lip;
-                       lv->lv_size = buf_size;
                        if (ordered) {
                                /* track as an ordered logvec */
                                ASSERT(lip->li_lv == NULL);
-                               lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
                                goto insert;
                        }
-                       lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
                }
 
-               /* Ensure the lv is set up according to ->iop_size */
-               lv->lv_niovecs = niovecs;
-
-               /* The allocated data region lies beyond the iovec region */
-               lv->lv_buf_len = 0;
-               lv->lv_bytes = 0;
-               lv->lv_buf = (char *)lv + buf_size - nbytes;
                ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
-
                lip->li_ops->iop_format(lip, lv);
 insert:
-               ASSERT(lv->lv_buf_len <= nbytes);
                xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
        }
 }
@@ -783,6 +906,13 @@ xfs_log_commit_cil(
        struct xlog             *log = mp->m_log;
        struct xfs_cil          *cil = log->l_cilp;
 
+       /*
+        * Do all necessary memory allocation before we lock the CIL.
+        * This ensures the allocation does not deadlock with a CIL
+        * push in memory reclaim (e.g. from kswapd).
+        */
+       xlog_cil_alloc_shadow_bufs(log, tp);
+
        /* lock out background commit */
        down_read(&cil->xc_ctx_lock);
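
The ordering xfs_log_commit_cil() now follows - size and allocate every buffer first, then take the lock that a memory-reclaim-driven CIL push also needs, and only swap pointers while it is held - is a general pattern for avoiding allocation deadlocks. A stripped-down sketch of the same idea with hypothetical names (error handling and space accounting omitted):

	struct example_item {
		char	*active;	/* buffer holding the currently formatted copy */
		size_t	 active_size;
		char	*shadow;	/* standby buffer, allocated before locking */
		size_t	 shadow_size;
	};

	/* Called before taking the shared context lock; may sleep in the allocator. */
	static void example_prepare(struct example_item *it, size_t need)
	{
		if (!it->shadow || it->shadow_size < need) {
			kfree(it->shadow);
			it->shadow = kmalloc(need, GFP_NOFS);
			it->shadow_size = need;
		}
	}

	/* Called with the shared context lock held: pointer swaps only, no allocation. */
	static void example_format(struct example_item *it, size_t need)
	{
		if (!it->active || it->active_size < need) {
			swap(it->active, it->shadow);
			swap(it->active_size, it->shadow_size);
		}
		/* format into it->active; the other buffer remains as next time's shadow */
	}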
 
index e39b023..970c19b 100644 (file)
@@ -272,13 +272,15 @@ xfs_readsb(
        buf_ops = NULL;
 
        /*
-        * Allocate a (locked) buffer to hold the superblock.
-        * This will be kept around at all times to optimize
-        * access to the superblock.
+        * Allocate a (locked) buffer to hold the superblock. This will be kept
+        * around at all times to optimize access to the superblock. Therefore,
+        * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
+        * elevated.
         */
 reread:
        error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
-                                  BTOBB(sector_size), 0, &bp, buf_ops);
+                                     BTOBB(sector_size), XBF_NO_IOACCT, &bp,
+                                     buf_ops);
        if (error) {
                if (loud)
                        xfs_warn(mp, "SB validate failed with error %d.", error);
index 184c44e..0cc8d8f 100644 (file)
        BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
                #structname ") is wrong, expected " #size)
 
+#define XFS_CHECK_OFFSET(structname, member, off) \
+       BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \
+               "XFS: offsetof(" #structname ", " #member ") is wrong, " \
+               "expected " #off)
+
 static inline void __init
 xfs_check_ondisk_structs(void)
 {
@@ -34,6 +39,8 @@ xfs_check_ondisk_structs(void)
        XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key,              8);
        XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec,              16);
        XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block,            4);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr,      48);
+       XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr,      64);
        XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block,           72);
        XFS_CHECK_STRUCT_SIZE(struct xfs_dinode,                176);
        XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot,            104);
@@ -75,27 +82,39 @@ xfs_check_ondisk_structs(void)
        XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t,      12);
         */
 
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen,  0);
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen,   2);
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval,   3);
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0);
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4);
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen,  8);
+       XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name,     9);
        XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t,             40);
-       XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t,             8);
+       XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize,     0);
+       XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count,       2);
+       XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4);
+       XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5);
+       XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags,   6);
+       XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7);
        XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t,                 12);
        XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t,                 16);
        XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t,              8);
        XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t,                16);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t,             4);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t,              16);
-       XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t,           6);
+       XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag,       0);
+       XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length,        2);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t,              16);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t,                  16);
-       XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t,                  4);
-       XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t,                  8);
-       XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t,                  8);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t,            8);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t,              16);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t,                  16);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t,             4);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t,              3);
+       XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen,          0);
+       XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset,           1);
+       XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name,             3);
        XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t,                10);
-       XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t,                2);
 
        /* log structures */
        XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat,          24);
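
The new XFS_CHECK_OFFSET macro used above is a build-time assertion: it wraps BUILD_BUG_ON_MSG(offsetof(...) != off, ...), so a compiler or ABI that pads these on-disk structures differently breaks the build rather than the filesystem. A rough standalone equivalent of one of the checks, using an illustrative copy of the structure layout and C11 _Static_assert:

	#include <stddef.h>

	/* Illustrative copy of the shortform dir entry layout, not the real header. */
	struct example_dir2_sf_entry {
		unsigned char	namelen;	/* actual name length */
		unsigned char	offset[2];	/* saved offset, now a plain byte array */
		unsigned char	name[];		/* name, variable size */
	};

	/* Equivalent of XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1) */
	_Static_assert(offsetof(struct example_dir2_sf_entry, offset) == 1,
		       "shortform dir entry offset field is misplaced");
	_Static_assert(offsetof(struct example_dir2_sf_entry, name) == 3,
		       "shortform dir entry name field is misplaced");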
index d5b7566..0f14b2e 100644 (file)
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2014 Christoph Hellwig.
  */
+#include <linux/iomap.h>
 #include "xfs.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
@@ -79,32 +80,6 @@ xfs_fs_get_uuid(
        return 0;
 }
 
-static void
-xfs_bmbt_to_iomap(
-       struct xfs_inode        *ip,
-       struct iomap            *iomap,
-       struct xfs_bmbt_irec    *imap)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-
-       if (imap->br_startblock == HOLESTARTBLOCK) {
-               iomap->blkno = IOMAP_NULL_BLOCK;
-               iomap->type = IOMAP_HOLE;
-       } else if (imap->br_startblock == DELAYSTARTBLOCK) {
-               iomap->blkno = IOMAP_NULL_BLOCK;
-               iomap->type = IOMAP_DELALLOC;
-       } else {
-               iomap->blkno =
-                       XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
-               if (imap->br_state == XFS_EXT_UNWRITTEN)
-                       iomap->type = IOMAP_UNWRITTEN;
-               else
-                       iomap->type = IOMAP_MAPPED;
-       }
-       iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
-       iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
-}
-
 /*
  * Get a layout for the pNFS client.
  */
index 76c0a4a..355dd9e 100644 (file)
@@ -98,8 +98,6 @@ xfs_growfs_rt(
 /*
  * From xfs_rtbitmap.c
  */
-int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
-                 xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
 int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
                      xfs_rtblock_t start, xfs_extlen_t len, int val,
                      xfs_rtblock_t *new, int *stat);
index 11ea5d5..0303f10 100644 (file)
@@ -546,7 +546,7 @@ xfs_showargs(
 
        return 0;
 }
-__uint64_t
+static __uint64_t
 xfs_max_file_offset(
        unsigned int            blockshift)
 {
@@ -1294,6 +1294,7 @@ xfs_fs_remount(
                 */
                xfs_restore_resvblks(mp);
                xfs_log_work_queue(mp);
+               xfs_queue_eofblocks(mp);
        }
 
        /* rw -> ro */
@@ -1306,6 +1307,13 @@ xfs_fs_remount(
                 * return it to the same size.
                 */
                xfs_save_resvblks(mp);
+
+               /*
+                * Cancel background eofb scanning so it cannot race with the
+                * final log force+buftarg wait and deadlock the remount.
+                */
+               cancel_delayed_work_sync(&mp->m_eofblocks_work);
+
                xfs_quiesce_attr(mp);
                mp->m_flags |= XFS_MOUNT_RDONLY;
        }
@@ -1565,10 +1573,6 @@ xfs_fs_fill_super(
                }
        }
 
-       if (xfs_sb_version_hassparseinodes(&mp->m_sb))
-               xfs_alert(mp,
-       "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
-
        error = xfs_mountfs(mp);
        if (error)
                goto out_filestream_unmount;
@@ -1692,8 +1696,9 @@ xfs_init_zones(void)
        if (!xfs_log_ticket_zone)
                goto out_free_ioend_bioset;
 
-       xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
-                                               "xfs_bmap_free_item");
+       xfs_bmap_free_item_zone = kmem_zone_init(
+                       sizeof(struct xfs_bmap_free_item),
+                       "xfs_bmap_free_item");
        if (!xfs_bmap_free_item_zone)
                goto out_destroy_log_ticket_zone;
 
index 2dfb1ce..529bce9 100644 (file)
@@ -61,8 +61,6 @@ struct xfs_mount;
 struct xfs_buftarg;
 struct block_device;
 
-extern __uint64_t xfs_max_file_offset(unsigned int);
-
 extern void xfs_flush_inodes(struct xfs_mount *mp);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
index 4c2c550..79cfd3f 100644 (file)
@@ -634,6 +634,9 @@ xfs_error_get_cfg(
 {
        struct xfs_error_cfg    *cfg;
 
+       if (error < 0)
+               error = -error;
+
        switch (error) {
        case EIO:
                cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
index ea94ee0..1451690 100644 (file)
@@ -354,6 +354,7 @@ DEFINE_BUF_EVENT(xfs_buf_submit_wait);
 DEFINE_BUF_EVENT(xfs_buf_bawrite);
 DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
+DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
 DEFINE_BUF_EVENT(xfs_buf_trylock);
 DEFINE_BUF_EVENT(xfs_buf_unlock);
 DEFINE_BUF_EVENT(xfs_buf_iowait);
@@ -1134,15 +1135,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
 )
 
 DECLARE_EVENT_CLASS(xfs_file_class,
-       TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
-       TP_ARGS(ip, count, offset, flags),
+       TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
+       TP_ARGS(ip, count, offset),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_ino_t, ino)
                __field(xfs_fsize_t, size)
                __field(loff_t, offset)
                __field(size_t, count)
-               __field(int, flags)
        ),
        TP_fast_assign(
                __entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -1150,25 +1150,25 @@ DECLARE_EVENT_CLASS(xfs_file_class,
                __entry->size = ip->i_d.di_size;
                __entry->offset = offset;
                __entry->count = count;
-               __entry->flags = flags;
        ),
-       TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
-                 "offset 0x%llx count 0x%zx ioflags %s",
+       TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->size,
                  __entry->offset,
-                 __entry->count,
-                 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
+                 __entry->count)
 )
 
 #define DEFINE_RW_EVENT(name)          \
 DEFINE_EVENT(xfs_file_class, name,     \
-       TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
-       TP_ARGS(ip, count, offset, flags))
-DEFINE_RW_EVENT(xfs_file_read);
+       TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),    \
+       TP_ARGS(ip, count, offset))
+DEFINE_RW_EVENT(xfs_file_buffered_read);
+DEFINE_RW_EVENT(xfs_file_direct_read);
+DEFINE_RW_EVENT(xfs_file_dax_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_dax_write);
 DEFINE_RW_EVENT(xfs_file_splice_read);
 
 DECLARE_EVENT_CLASS(xfs_page_class,
@@ -1295,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
+DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IOMAP_EVENT(xfs_iomap_found);
+DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
index 9a462e8..9b2b9fa 100644 (file)
@@ -52,6 +52,7 @@ typedef struct xfs_log_item {
        /* delayed logging */
        struct list_head                li_cil;         /* CIL pointers */
        struct xfs_log_vec              *li_lv;         /* active log vector */
+       struct xfs_log_vec              *li_lv_shadow;  /* standby vector */
        xfs_lsn_t                       li_seq;         /* CIL commit seq */
 } xfs_log_item_t;
 
index d841450..b03c062 100644 (file)
@@ -6,6 +6,7 @@
 struct dentry;
 struct iattr;
 struct inode;
+struct iomap;
 struct super_block;
 struct vfsmount;
 
@@ -187,21 +188,6 @@ struct fid {
  *    get_name is not (which is possibly inconsistent)
  */
 
-/* types of block ranges for multipage write mappings. */
-#define IOMAP_HOLE     0x01    /* no blocks allocated, need allocation */
-#define IOMAP_DELALLOC 0x02    /* delayed allocation blocks */
-#define IOMAP_MAPPED   0x03    /* blocks allocated @blkno */
-#define IOMAP_UNWRITTEN        0x04    /* blocks allocated @blkno in unwritten state */
-
-#define IOMAP_NULL_BLOCK -1LL  /* blkno is not valid */
-
-struct iomap {
-       sector_t        blkno;  /* first sector of mapping */
-       loff_t          offset; /* file offset of mapping, bytes */
-       u64             length; /* length of mapping, bytes */
-       int             type;   /* type of mapping */
-};
-
 struct export_operations {
        int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
                        struct inode *parent);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
new file mode 100644 (file)
index 0000000..3267df4
--- /dev/null
@@ -0,0 +1,70 @@
+#ifndef LINUX_IOMAP_H
+#define LINUX_IOMAP_H 1
+
+#include <linux/types.h>
+
+struct fiemap_extent_info;
+struct inode;
+struct iov_iter;
+struct kiocb;
+struct vm_area_struct;
+struct vm_fault;
+
+/*
+ * Types of block ranges for iomap mappings:
+ */
+#define IOMAP_HOLE     0x01    /* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC 0x02    /* delayed allocation blocks */
+#define IOMAP_MAPPED   0x03    /* blocks allocated @blkno */
+#define IOMAP_UNWRITTEN        0x04    /* blocks allocated @blkno in unwritten state */
+
+/*
+ * Magic value for blkno:
+ */
+#define IOMAP_NULL_BLOCK -1LL  /* blkno is not valid */
+
+struct iomap {
+       sector_t                blkno;  /* 1st sector of mapping, 512b units */
+       loff_t                  offset; /* file offset of mapping, bytes */
+       u64                     length; /* length of mapping, bytes */
+       int                     type;   /* type of mapping */
+       struct block_device     *bdev;  /* block device for I/O */
+};
+
+/*
+ * Flags for iomap_begin / iomap_end.  No flag implies a read.
+ */
+#define IOMAP_WRITE            (1 << 0)
+#define IOMAP_ZERO             (1 << 1)
+
+struct iomap_ops {
+       /*
+        * Return the existing mapping at pos, or reserve space starting at
+        * pos for up to length, as long as we can do it as a single mapping.
+        * The actual length is returned in iomap->length.
+        */
+       int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
+                       unsigned flags, struct iomap *iomap);
+
+       /*
+        * Commit and/or unreserve space previously allocated using iomap_begin.
+        * Written indicates the length of the successful write operation which
+        * needs to be committed, while the rest needs to be unreserved.
+        * Written might be zero if no data was written.
+        */
+       int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length,
+                       ssize_t written, unsigned flags, struct iomap *iomap);
+};
+
+ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
+               struct iomap_ops *ops);
+int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
+               bool *did_zero, struct iomap_ops *ops);
+int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+               struct iomap_ops *ops);
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+               struct iomap_ops *ops);
+int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+               loff_t start, loff_t len, struct iomap_ops *ops);
+
+#endif /* LINUX_IOMAP_H */
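
To make the new interface concrete, here is a minimal, hypothetical iomap_ops provider for a filesystem whose files live in one contiguous on-disk extent. Every example_ name is an assumption; a real implementation (such as the xfs_iomap_ops referenced above) also handles allocation, delalloc and unwritten extents:

	static int
	example_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
			unsigned flags, struct iomap *iomap)
	{
		loff_t isize = i_size_read(inode);

		if (pos >= isize) {
			/* nothing allocated past EOF: report a hole */
			iomap->blkno = IOMAP_NULL_BLOCK;
			iomap->type = IOMAP_HOLE;
			iomap->offset = pos;
			iomap->length = length;
			return 0;
		}

		/* one contiguous extent covering the whole file */
		iomap->blkno = example_file_start_sector(inode);	/* assumed helper */
		iomap->type = IOMAP_MAPPED;
		iomap->offset = 0;
		iomap->length = isize;
		iomap->bdev = inode->i_sb->s_bdev;
		return 0;
	}

	static int
	example_iomap_end(struct inode *inode, loff_t pos, loff_t length,
			ssize_t written, unsigned flags, struct iomap *iomap)
	{
		return 0;	/* iomap_begin reserved nothing, so nothing to undo */
	}

	struct iomap_ops example_iomap_ops = {
		.iomap_begin	= example_iomap_begin,
		.iomap_end	= example_iomap_end,
	};

Passing &example_iomap_ops to iomap_fiemap(), iomap_zero_range() or iomap_truncate_page() is then all such a filesystem needs to pick up the generic implementations.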